diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1372,61 +1372,48 @@ >; } + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ -// Prevent expanding both fneg and fabs. -// TODO: Add IgnoredBySelectionDAG bit? -let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG - def : GCNPat < - (fneg (fabs (f32 SReg_32:$src))), + (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit >; def : GCNPat < - (fabs (f32 SReg_32:$src)), + (UniformUnaryFrag<fabs> (f32 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) >; def : GCNPat < - (fneg (f32 SReg_32:$src)), + (UniformUnaryFrag<fneg> (f32 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) >; def : GCNPat < - (fneg (f16 SReg_32:$src)), + (UniformUnaryFrag<fneg> (f16 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) >; def : GCNPat < - (fneg (f16 VGPR_32:$src)), - (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) ->; -def : GCNPat < - (fabs (f16 SReg_32:$src)), + (UniformUnaryFrag<fabs> (f16 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) >; def : GCNPat < - (fneg (fabs (f16 SReg_32:$src))), + (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit >; def : GCNPat < - (fneg (fabs (f16 VGPR_32:$src))), - (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit ->; -def : GCNPat < - (fneg (v2f16 SReg_32:$src)), + (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) >; def : GCNPat < - (fabs (v2f16 SReg_32:$src)), + (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) >; @@ -1435,51 +1422,20 @@ // fabs is not reported as free because there is modifier for it in // VOP3P instructions, so it is turned into the bit op. def : GCNPat < - (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), + (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (fneg (v2f16 (fabs SReg_32:$src))), + (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; -// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled - // def : GCNPat < -// (fneg (f64 SReg_64:$src)), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (i32 (S_MOV_B32 (i32 0x80000000)))), -// sub1) -// >; -// def : GCNPat < -// (fneg (fabs (f64 SReg_64:$src))), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (S_MOV_B32 (i32 0x80000000))), // Set sign bit. -// sub1) -// >; -// FIXME: Use S_BITSET0_B32/B64? -// def : GCNPat < -// (fabs (f64 SReg_64:$src)), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (i32 (S_MOV_B32 (i32 0x7fffffff)))), -// sub1) -// >; // COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead // of the real value. 
def : GCNPat < - (fneg (v2f32 SReg_64:$src)), + (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)), (v2f32 (REG_SEQUENCE SReg_64, (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), (i32 (S_MOV_B32 (i32 0x80000000)))), @@ -1489,7 +1445,64 @@ SReg_32)), sub1)) >; -} // End let AddedComplexity = 1 +def : GCNPat < + (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x7fffffff)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x7fffffff)))), + SReg_32)), sub1)) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub1)) +>; + +// FIXME: Use S_BITSET0_B32/B64? +def : GCNPat < + (UniformUnaryFrag<fabs> (f64 SReg_64:$src)), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Set sign bit. + sub1) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (f64 SReg_64:$src)), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)), + sub1) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (S_MOV_B32 (i32 0x80000000))), SReg_32)),// Set sign bit. + sub1) +>; + + +def : GCNPat < + (fneg (fabs (f32 VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit +>; def : GCNPat < (fabs (f32 VGPR_32:$src)), @@ -1506,6 +1519,16 @@ (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) >; +def : GCNPat < + (fneg (f16 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (fabs (f16 VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit +>; + def : GCNPat < (fneg (v2f16 VGPR_32:$src)), (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) @@ -1518,7 +1541,7 @@ def : GCNPat < (fneg (v2f16 (fabs VGPR_32:$src))), - (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit + (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < @@ -1526,30 +1549,28 @@ (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. 
+ (V_AND_B32_e32 (i32 (S_MOV_B32 (i32 0x7fffffff))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; -// TODO: Use SGPR for constant def : GCNPat < (fneg (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (i32 (V_MOV_B32_e32 (i32 0x80000000)))), + (V_XOR_B32_e32 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; -// TODO: Use SGPR for constant def : GCNPat < (fneg (fabs (f64 VReg_64:$src))), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. + (V_OR_B32_e32 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir @@ -225,9 +225,9 @@ ; GCN-LABEL: name: fabs_s64_vv ; GCN: liveins: $vgpr0_vgpr1 ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 + ; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 [[S_MOV_B32_]], [[COPY1]], implicit $exec ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_AND_B32_e64_]], %subreg.sub1 ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] @@ -268,9 +268,9 @@ ; GCN-LABEL: name: fabs_s64_vv_no_src_constraint ; GCN: liveins: $vgpr0_vgpr1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1 - ; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 + ; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[V_AND_B32_e64_]], %subreg.sub1 ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir @@ -225,9 +225,9 @@ ; GCN-LABEL: name: fneg_s64_vv ; GCN: liveins: $vgpr0_vgpr1 ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483648, implicit $exec ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GCN: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GCN: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY1]], implicit $exec ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, 
[[V_XOR_B32_e32_]], %subreg.sub1 ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] @@ -493,9 +493,9 @@ ; GCN-LABEL: name: fneg_fabs_s64_vv ; GCN: liveins: $vgpr0_vgpr1 ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483648, implicit $exec ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GCN: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GCN: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_MOV_B32_]], [[COPY1]], implicit $exec ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_OR_B32_e32_]], %subreg.sub1 ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] @@ -518,9 +518,9 @@ ; GCN: liveins: $sgpr0_sgpr1 ; GCN: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 ; GCN: [[FABS:%[0-9]+]]:vreg_64(s64) = G_FABS [[COPY]] - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 2147483648, implicit $exec ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY [[FABS]].sub1(s64) - ; GCN: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32(s16) = V_XOR_B32_e32 [[COPY1]](s32), [[V_MOV_B32_e32_]](s32), implicit $exec + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32(s32) = S_MOV_B32 2147483648 + ; GCN: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32(s16) = V_XOR_B32_e32 [[S_MOV_B32_]](s32), [[COPY1]](s32), implicit $exec ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY [[FABS]].sub0(s64) ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64(s64) = REG_SEQUENCE [[COPY2]](s32), %subreg.sub0, [[V_XOR_B32_e32_]](s16), %subreg.sub1 ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]](s64) diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -21,8 +21,7 @@ } ; FUNC-LABEL: {{^}}fabs_f64: -; SI: v_and_b32 -; SI-NOT: v_and_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) { %fabs = call double @llvm.fabs.f64(double %in) @@ -31,8 +30,8 @@ } ; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: v_and_b32 -; SI: v_and_b32 +; SI: s_and_b32 +; SI: s_and_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) @@ -41,10 +40,10 @@ } ; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 +; SI: s_and_b32 +; SI: s_and_b32 +; SI: s_and_b32 +; SI: s_and_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) @@ -77,7 +76,7 @@ } ; FUNC-LABEL: {{^}}fabs_free_f64: -; SI: v_and_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double @@ -87,7 +86,7 @@ } ; FUNC-LABEL: {{^}}fabs_fn_free_f64: -; SI: v_and_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll @@ -0,0 +1,504 @@ +; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GCN,SI %s +; 
RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GCN,FP16 %s + + +define amdgpu_kernel void @divergent_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: V_XOR_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %val = load volatile float, float addrspace(1)* %in.gep + %fneg = fneg float %val + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @uniform_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) { +; GCN-LABEL: name: uniform_fneg_f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx + %val = load volatile float, float addrspace(1)* %in.gep + %fneg = fneg float %val + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @divergent_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fabs_f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 +; GCN: V_AND_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %val = load volatile float, float addrspace(1)* %in.gep + %fabs = call float @llvm.fabs.f32(float %val) + store float %fabs, float addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @uniform_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) { +; GCN-LABEL: name: uniform_fabs_f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 +; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx + %val = load volatile float, float addrspace(1)* %in.gep + %fabs = call float @llvm.fabs.f32(float %val) + store float %fabs, float addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @divergent_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_fabs_f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: V_OR_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %val = load volatile float, float addrspace(1)* %in.gep + %fabs = call float @llvm.fabs.f32(float %val) + %fneg = fneg float %fabs + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @uniform_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) { +; GCN-LABEL: name: uniform_fneg_fabs_f32 +; GCN-LABEL: bb.0 
(%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx + %val = load volatile float, float addrspace(1)* %in.gep + %fabs = call float @llvm.fabs.f32(float %val) + %fneg = fneg float %fabs + store float %fneg, float addrspace(1)* %out.gep + ret void +} + + +define amdgpu_kernel void @divergent_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out) { +; GCN-LABEL: name: divergent_fabs_f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32767 +; FP16: V_AND_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext + %val = load volatile half, half addrspace(1)* %in.gep + %fabs = call half @llvm.fabs.f16(half %val) + store half %fabs, half addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @uniform_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) { +; GCN-LABEL: name: uniform_fabs_f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32767 +; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx + %val = load volatile half, half addrspace(1)* %in.gep + %fabs = call half @llvm.fabs.f16(half %val) + store half %fabs, half addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @divergent_fneg_f16(half addrspace(1)* %in, half addrspace(1)* %out) { +; GCN-LABEL: name: divergent_fneg_f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768 +; FP16: V_XOR_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext + %val = load volatile half, half addrspace(1)* %in.gep + %fneg = fneg half %val + store half %fneg, half addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @uniform_fneg_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) { +; GCN-LABEL: name: uniform_fneg_f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768 +; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx + %val = load volatile half, half addrspace(1)* %in.gep + %fneg = fneg half %val + store half %fneg, half addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @divergent_fneg_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out) { +; GCN-LABEL: name: divergent_fneg_fabs_f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768 +; FP16: V_OR_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext + %val = load volatile half, half addrspace(1)* %in.gep + %fabs = call half @llvm.fabs.f16(half %val) + %fneg = fneg half %fabs + store half %fneg, half addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @uniform_fneg_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) { +; GCN-LABEL: name: uniform_fneg_fabs_f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768 +; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]] + + 
%in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx + %val = load volatile half, half addrspace(1)* %in.gep + %fabs = call half @llvm.fabs.f16(half %val) + %fneg = fneg half %fabs + store half %fneg, half addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @divergent_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_v2f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880 +; FP16: V_XOR_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fneg = fneg <2 x half> %val + store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @uniform_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) { +; GCN-LABEL: name: uniform_fneg_v2f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880 +; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fneg = fneg <2 x half> %val + store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @divergent_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fabs_v2f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147450879 +; FP16: V_AND_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @uniform_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) { +; GCN-LABEL: name: uniform_fabs_v2f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147450879 +; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @divergent_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_fabs_v2f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880 +; FP16: V_OR_B32_e32 killed %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, 
align 2 + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %fneg = fneg <2 x half> %fabs + store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @uniform_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) { +; GCN-LABEL: name: uniform_fneg_fabs_v2f16 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880 +; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]] + + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %fneg = fneg <2 x half> %fabs + store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @divergent_fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_v2f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: V_XOR_B32_e32 %[[REG]] +; GCN: V_XOR_B32_e32 %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4 + %fneg = fneg <2 x float> %val + store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @uniform_fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) { +; GCN-LABEL: name: uniform_fneg_v2f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: S_XOR_B32 killed %{{[0-9]+}}, %[[REG]] +; GCN: S_XOR_B32 killed %{{[0-9]+}}, %[[REG]] + + %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx + %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx + %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4 + %fneg = fneg <2 x float> %val + store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @divergent_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fabs_v2f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 +; GCN: V_AND_B32_e32 %[[REG]] +; GCN: V_AND_B32_e32 %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4 + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val) + store <2 x float> %fabs, <2 x float> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @uniform_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) { +; GCN-LABEL: name: uniform_fabs_v2f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 +; GCN: S_AND_B32 killed %{{[0-9]+}}, %[[REG]] +; GCN: S_AND_B32 killed %{{[0-9]+}}, %[[REG]] + + %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx + %gep.out = getelementptr inbounds <2 x float>, <2 x float> 
addrspace(1)* %in, i32 %idx + %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4 + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val) + store <2 x float> %fabs, <2 x float> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @divergent_fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_fabs_v2f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: V_OR_B32_e32 %[[REG]] +; GCN: V_OR_B32_e32 %[[REG]] + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4 + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val) + %fneg = fneg <2 x float> %fabs + store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @uniform_fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) { +; GCN-LABEL: name: uniform_fneg_fabs_v2f32 +; GCN-LABEL: bb.0 (%ir-block.0) +; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: S_OR_B32 killed %{{[0-9]+}}, %[[REG]] +; GCN: S_OR_B32 killed %{{[0-9]+}}, %[[REG]] + + %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx + %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx + %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4 + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val) + %fneg = fneg <2 x float> %fabs + store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @divergent_fneg_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_f64 +; GCN-LABEL: bb.0 (%ir-block.0) +; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 +; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR +; GCN: %[[HI32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub1 +; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: %[[XOR:[0-9]+]]:vgpr_32 = V_XOR_B32_e32 killed %[[SREG_MASK]], killed %[[HI32]] +; GCN: %[[LO32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub0 +; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[XOR]], %subreg.sub1 + + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %val = load volatile double, double addrspace(1)* %in.gep + %fneg = fneg double %val + store double %fneg, double addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @uniform_fneg_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) { +; GCN-LABEL: name: uniform_fneg_f64 +; GCN-LABEL: bb.0 (%ir-block.0) +; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 +; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR +; GCN: %[[LO32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub0 +; GCN: %[[HI32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub1 +; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: %[[XOR:[0-9]+]]:sreg_32 = S_XOR_B32 killed %[[HI32]], killed %[[SREG_MASK]] +; GCN: %[[XOR_COPY:[0-9]+]]:sreg_32 = COPY %[[XOR]] +; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[XOR_COPY]], 
%subreg.sub1 + + %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx + %val = load volatile double, double addrspace(1)* %in.gep + %fneg = fneg double %val + store double %fneg, double addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @divergent_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fabs_f64 +; GCN-LABEL: bb.0 (%ir-block.0) +; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 +; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR +; GCN: %[[HI32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub1 +; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 +; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e32 killed %[[SREG_MASK]], killed %[[HI32]] +; GCN: %[[LO32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub0 +; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[AND]], %subreg.sub1 + + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %val = load volatile double, double addrspace(1)* %in.gep + %fabs = call double @llvm.fabs.f64(double %val) + store double %fabs, double addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @uniform_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) { +; GCN-LABEL: name: uniform_fabs_f64 +; GCN-LABEL: bb.0 (%ir-block.0) +; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 +; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR +; GCN: %[[LO32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub0 +; GCN: %[[HI32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub1 +; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647 +; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %[[HI32]], killed %[[SREG_MASK]] +; GCN: %[[AND_COPY:[0-9]+]]:sreg_32 = COPY %[[AND]] +; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[AND_COPY]], %subreg.sub1 + + + %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx + %val = load volatile double, double addrspace(1)* %in.gep + %fabs = call double @llvm.fabs.f64(double %val) + store double %fabs, double addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @divergent_fneg_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +; GCN-LABEL: name: divergent_fneg_fabs_f64 +; GCN-LABEL: bb.0 (%ir-block.0) +; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 +; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR +; GCN: %[[HI32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub1 +; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: %[[OR:[0-9]+]]:vgpr_32 = V_OR_B32_e32 killed %[[SREG_MASK]], killed %[[HI32]] +; GCN: %[[LO32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub0 +; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[OR]], %subreg.sub1 + + + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %val = load volatile double, double addrspace(1)* %in.gep + %fabs = call double @llvm.fabs.f64(double %val) + %fneg = fneg double %fabs + store double %fneg, double addrspace(1)* %out.gep + ret 
void +} + +define amdgpu_kernel void @uniform_fneg_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) { +; GCN-LABEL: name: uniform_fneg_fabs_f64 +; GCN-LABEL: bb.0 (%ir-block.0) +; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 +; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR +; GCN: %[[LO32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub0 +; GCN: %[[HI32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub1 +; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 +; GCN: %[[OR:[0-9]+]]:sreg_32 = S_OR_B32 killed %[[HI32]], killed %[[SREG_MASK]] +; GCN: %[[OR_COPY:[0-9]+]]:sreg_32 = COPY %[[OR]] +; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[OR_COPY]], %subreg.sub1 + + + %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx + %val = load volatile double, double addrspace(1)* %in.gep + %fabs = call double @llvm.fabs.f64(double %val) + %fneg = fneg double %fabs + store double %fneg, double addrspace(1)* %out.gep + ret void +} + +declare float @llvm.fabs.f32(float) +declare half @llvm.fabs.f16(half) +declare double @llvm.fabs.f64(double) +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -44,8 +44,8 @@ } ; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64: -; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} -; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; SI: s_bitset1_b32 +; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { %bc = bitcast i64 %in to double %fabs = call double @fabs(double %bc) @@ -55,11 +55,11 @@ } ; GCN-LABEL: {{^}}fneg_fabs_f64: -; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} ; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13 ; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c -; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]] +; GCN-DAG: s_bitset1_b32 s[[HI_X]], 31 ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] +; GCN-DAG: v_mov_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) { %fabs = call double @llvm.fabs.f64(double %in) @@ -69,10 +69,10 @@ } ; GCN-LABEL: {{^}}fneg_fabs_v2f64: -; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} +; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs @@ -81,12 +81,12 @@ } ; GCN-LABEL: {{^}}fneg_fabs_v4f64: -; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} +; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: 
v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}fneg_f64: -; GCN: v_xor_b32 +; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double %in) { %fneg = fsub double -0.000000e+00, %in store double %fneg, double addrspace(1)* %out @@ -10,8 +10,8 @@ } ; FUNC-LABEL: {{^}}fneg_v2f64: -; GCN: v_xor_b32 -; GCN: v_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in store <2 x double> %fneg, <2 x double> addrspace(1)* %out @@ -24,10 +24,10 @@ ; R600: -PV ; R600: -PV -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 define amdgpu_kernel void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in store <4 x double> %fneg, <4 x double> addrspace(1)* %out