Index: llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -969,12 +969,19 @@ return true; } + // If we should use the generic intrinsic instead of emitting a libcall + const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy(); + // powr ---> exp2(y * log2(x)) // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) - FunctionCallee ExpExpr = - getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); - if (!ExpExpr) - return false; + FunctionCallee ExpExpr; + if (ShouldUseIntrinsic) + ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()}); + else { + ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); + if (!ExpExpr) + return false; + } bool needlog = false; bool needabs = false; @@ -1043,10 +1050,16 @@ nval = cnval ? cnval : opr0; } if (needlog) { - FunctionCallee LogExpr = - getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); - if (!LogExpr) - return false; + FunctionCallee LogExpr; + if (ShouldUseIntrinsic) { + LogExpr = + Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()}); + } else { + LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); + if (!LogExpr) + return false; + } + nval = CreateCallEx(B,LogExpr, nval, "__log2"); } Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -57,10 +57,62 @@ ; CHECK-LABEL: test_pow_fast_f16__integral_y: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 +; CHECK-NEXT: v_log_f16_e64 v2, |v0| +; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_f16_e32 v2, v2, v1 +; CHECK-NEXT: v_exp_f16_e32 v2, v2 +; CHECK-NEXT: v_cvt_i16_f16_e32 v1, v1 +; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; CHECK-NEXT: v_and_b32_e32 v0, v1, v0 +; CHECK-NEXT: v_or_b32_e32 v0, v0, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %y = sitofp i32 %y.i to half + %pow = tail call fast half @_Z3powDhDh(half %x, half %y) + ret half %pow +} + +define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) { +; CHECK-LABEL: test_pow_fast_f32__integral_y: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x800000 +; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 +; CHECK-NEXT: v_log_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, v2, v1 +; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 +; CHECK-NEXT: v_mov_b32_e32 v4, 0x42800000 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; CHECK-NEXT: v_fma_f32 v2, v2, v1, v3 +; CHECK-NEXT: v_exp_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %y = sitofp i32 %y.i to float + %pow = tail call fast float @_Z3powff(float %x, float %y) + ret float %pow +} + +define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { +; CHECK-LABEL: test_pow_fast_f64__integral_y: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v40, s16, 14 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 @@ -76,19 +128,20 @@ ; CHECK-NEXT: v_writelane_b32 v40, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v42, v0 -; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v1 +; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_writelane_b32 v40, s42, 10 +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v40, s43, 11 +; CHECK-NEXT: v_mov_b32_e32 v43, v1 ; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_cvt_f16_f32_e32 v43, v0 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v42 +; CHECK-NEXT: v_mov_b32_e32 v42, v2 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_writelane_b32 v40, s45, 13 ; CHECK-NEXT: v_mov_b32_e32 v41, v31 @@ -99,13 +152,14 @@ ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: v_cvt_f64_i32_e32 v[44:45], v42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[44:45] ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_mul_f16_e32 v0, v0, v43 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -117,13 +171,14 @@ ; CHECK-NEXT: v_mov_b32_e32 v31, v41 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_i16_f16_e32 v1, v43 -; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1 -; CHECK-NEXT: v_and_b32_e32 v1, v1, v42 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_or_b32_e32 v0, v1, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42 +; CHECK-NEXT: v_and_b32_e32 v2, v2, v43 +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_readlane_b32 s45, v40, 13 ; CHECK-NEXT: v_readlane_b32 s44, v40, 12 ; CHECK-NEXT: v_readlane_b32 s43, v40, 11 @@ -140,19 +195,63 @@ ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: v_readlane_b32 s4, v40, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] - %y = sitofp i32 %y.i to half - %pow = tail call fast half @_Z3powDhDh(half %x, half %y) - ret half %pow + %y = sitofp i32 %y.i to double + %pow = tail call fast double @_Z3powdd(double %x, double %y) + ret double %pow } -define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) { -; CHECK-LABEL: test_pow_fast_f32__integral_y: +; -------------------------------------------------------------------- +; test powr +; -------------------------------------------------------------------- + +define half @test_powr_fast_f16(half %x, half %y) { +; CHECK-LABEL: test_powr_fast_f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_log_f16_e32 v0, v0 +; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1 +; CHECK-NEXT: v_exp_f16_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %powr = tail call fast half @_Z4powrDhDh(half %x, half %y) + ret half %powr +} + +define float @test_powr_fast_f32(float %x, float %y) { +; CHECK-LABEL: test_powr_fast_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x800000 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_mul_f32_e32 v0, v0, v3 +; CHECK-NEXT: v_log_f32_e32 v0, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 +; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v2, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2 +; CHECK-NEXT: v_exp_f32_e32 v0, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %powr = tail call fast float @_Z4powrff(float %x, float %y) + ret float %powr +} + +define double @test_powr_fast_f64(double %x, double %y) { +; CHECK-LABEL: test_powr_fast_f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 @@ -174,20 +273,18 @@ ; CHECK-NEXT: v_writelane_b32 v40, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v42, v0 ; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: v_mov_b32_e32 v43, v31 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -195,14 +292,15 @@ ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_cvt_f32_i32_e32 v43, v1 +; CHECK-NEXT: v_mov_b32_e32 v42, v3 +; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[41:42] ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v43 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -211,18 +309,15 @@ ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v43 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v43 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; CHECK-NEXT: v_and_or_b32 v0, v1, v42, v0 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s45, v40, 13 +; CHECK-NEXT: v_readlane_b32 s44, v40, 12 +; CHECK-NEXT: v_readlane_b32 s43, v40, 11 ; CHECK-NEXT: v_readlane_b32 s42, v40, 10 ; CHECK-NEXT: v_readlane_b32 s41, v40, 9 ; CHECK-NEXT: v_readlane_b32 s40, v40, 8 @@ -242,19 +337,70 @@ ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] - %y = sitofp i32 %y.i to float - %pow = tail call fast float @_Z3powff(float %x, float %y) - ret float %pow + %powr = tail call fast double @_Z4powrdd(double %x, double %y) + ret double %powr } -define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { -; CHECK-LABEL: test_pow_fast_f64__integral_y: +; -------------------------------------------------------------------- +; test pown +; -------------------------------------------------------------------- + +define half @test_pown_fast_f16(half %x, i32 %y) { +; CHECK-LABEL: test_pown_fast_f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v2, v1 +; CHECK-NEXT: v_log_f16_e64 v3, |v0| +; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; CHECK-NEXT: v_and_b32_e32 v0, v1, v0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2 +; CHECK-NEXT: v_exp_f16_e32 v2, v2 +; CHECK-NEXT: v_or_b32_e32 v0, v0, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) + ret half %call +} + +define float @test_pown_fast_f32(float %x, i32 %y) { +; CHECK-LABEL: test_pown_fast_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x800000 +; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 +; CHECK-NEXT: v_log_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4 +; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 +; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3 +; CHECK-NEXT: v_exp_f32_e32 v2, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call = tail call fast float @_Z4pownfi(float %x, i32 %y) + ret float %call +} + +define double @test_pown_fast_f64(double %x, i32 %y) { +; CHECK-LABEL: test_pown_fast_f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v40, s16, 14 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 @@ -274,15 +420,12 @@ ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v40, s43, 11 ; CHECK-NEXT: v_mov_b32_e32 v43, v1 ; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_mov_b32_e32 v42, v2 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_writelane_b32 v40, s45, 13 @@ -294,15 +437,16 @@ ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[44:45], v42 +; CHECK-NEXT: v_mov_b32_e32 v42, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[44:45] +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -315,11 +459,9 @@ ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42 ; CHECK-NEXT: v_and_b32_e32 v2, v2, v43 -; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_readlane_b32 s45, v40, 13 ; CHECK-NEXT: v_readlane_b32 s44, v40, 12 @@ -337,23 +479,65 @@ ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: v_readlane_b32 s4, v40, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] - %y = sitofp i32 %y.i to double - %pow = tail call fast double @_Z3powdd(double %x, double %y) - ret double %pow + %call = tail call fast double @_Z4powndi(double %x, i32 %y) + ret double %call } -; -------------------------------------------------------------------- -; test powr -; -------------------------------------------------------------------- +define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) { +; CHECK-LABEL: test_pown_fast_f16_known_even: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 +; CHECK-NEXT: v_log_f16_e64 v0, |v0| +; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1 +; CHECK-NEXT: v_exp_f16_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %y = shl i32 %y.arg, 1 + %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) + ret half %call +} -define half @test_powr_fast_f16(half %x, half %y) { -; CHECK-LABEL: test_powr_fast_f16: +define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) { +; CHECK-LABEL: test_pown_fast_f32_known_even: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x800000 +; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_mul_f32_e64 v0, |v0|, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_log_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v2, v0, v1 +; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2 +; CHECK-NEXT: v_exp_f32_e32 v0, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %y = shl i32 %y.arg, 1 + %call = tail call fast float @_Z4pownfi(float %x, i32 %y) + ret float %call +} + +define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { +; CHECK-LABEL: test_pown_fast_f64_known_even: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 @@ -375,12 +559,13 @@ ; CHECK-NEXT: v_writelane_b32 v40, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_writelane_b32 v40, s42, 10 ; CHECK-NEXT: v_writelane_b32 v40, s43, 11 ; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill @@ -393,15 +578,16 @@ ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v1 +; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_mul_f16_e32 v0, v0, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -436,846 +622,23 @@ ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] - %powr = tail call fast half @_Z4powrDhDh(half %x, half %y) - ret half %powr + %y = shl i32 %y.arg, 1 + %call = tail call fast double @_Z4powndi(double %x, i32 %y) + ret double %call } -define float @test_powr_fast_f32(float %x, float %y) { -; CHECK-LABEL: test_powr_fast_f32: +define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) { +; CHECK-LABEL: test_pown_fast_f16_known_odd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %powr = tail call fast float @_Z4powrff(float %x, float %y) - ret float %powr -} - -define double @test_powr_fast_f64(double %x, double %y) { -; CHECK-LABEL: test_powr_fast_f64: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v43, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v3 -; CHECK-NEXT: v_mov_b32_e32 v41, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[41:42] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v43 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %powr = tail call fast double @_Z4powrdd(double %x, double %y) - ret double %powr -} - -; -------------------------------------------------------------------- -; test pown -; -------------------------------------------------------------------- - -define half @test_pown_fast_f16(half %x, i32 %y) { -; CHECK-LABEL: test_pown_fast_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v43, v0 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v43 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1 -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v42 -; CHECK-NEXT: v_and_b32_e32 v1, v1, v43 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_or_b32_e32 v0, v1, v0 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) - ret half %call -} - -define float @test_pown_fast_f32(float %x, i32 %y) { -; CHECK-LABEL: test_pown_fast_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v43, v0 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v43 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v42 -; CHECK-NEXT: v_and_or_b32 v0, v1, v43, v0 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %call = tail call fast float @_Z4pownfi(float %x, i32 %y) - ret float %call -} - -define double @test_pown_fast_f64(double %x, i32 %y) { -; CHECK-LABEL: test_pown_fast_f64: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v43, v1 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42 -; CHECK-NEXT: v_and_b32_e32 v2, v2, v43 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %call = tail call fast double @_Z4powndi(double %x, i32 %y) - ret double %call -} - -define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f16_known_even: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1 -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %y = shl i32 %y.arg, 1 - %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) - ret half %call -} - -define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f32_known_even: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %y = shl i32 %y.arg, 1 - %call = tail call fast float @_Z4pownfi(float %x, i32 %y) - ret float %call -} - -define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f64_known_even: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %y = shl i32 %y.arg, 1 - %call = tail call fast double @_Z4powndi(double %x, i32 %y) - ret double %call -} - -define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f16_known_odd: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v42, v0 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_or_b32_e32 v43, 1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v43 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 +; CHECK-NEXT: v_log_f16_e64 v2, |v0| +; CHECK-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 ; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1 -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff8000, v42 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_or_b32_e32 v0, v1, v0 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mul_f16_e32 v1, v2, v1 +; CHECK-NEXT: v_exp_f16_e32 v1, v1 +; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) @@ -1286,92 +649,29 @@ ; CHECK-LABEL: test_pown_fast_f32_known_odd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v42, v0 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: v_or_b32_e32 v43, 1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v43 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_mov_b32 s4, 0x800000 +; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 +; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_log_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, v2, v1 +; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 +; CHECK-NEXT: v_mov_b32_e32 v4, 0x42800000 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; CHECK-NEXT: v_fma_f32 v1, v2, v1, v3 +; CHECK-NEXT: v_exp_f32_e32 v1, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: s_brev_b32 s4, 1 -; CHECK-NEXT: v_and_or_b32 v0, v42, s4, v0 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_and_or_b32 v0, v0, s4, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast float @_Z4pownfi(float %x, i32 %y) Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll @@ -1074,9 +1074,9 @@ define float @test_pow_afn_f32_nnan_ninf_x_known_positive(float nofpclass(ninf nnorm nsub) %x, float %y) { ; CHECK-LABEL: define float @test_pow_afn_f32_nnan_ninf_x_known_positive ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y) @@ -1096,9 +1096,9 @@ define <2 x float> @test_pow_afn_v2f32_nnan_ninf_x_known_positive(<2 x float> nofpclass(ninf nnorm nsub) %x, <2 x float> %y) { ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf_x_known_positive ; CHECK-SAME: (<2 x float> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x float> [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y) @@ -1242,9 +1242,9 @@ define half @test_pow_afn_f16_nnan_ninf_x_known_positive(half nofpclass(ninf nnorm nsub) %x, half %y) { ; CHECK-LABEL: define half @test_pow_afn_f16_nnan_ninf_x_known_positive ; CHECK-SAME: (half nofpclass(ninf nsub nnorm) [[X:%.*]], half [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn half @_Z4log2Dh(half [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn half @llvm.log2.f16(half [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn half [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn half @_Z4exp2Dh(half [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn half @llvm.exp2.f16(half [[__YLOGX]]) ; CHECK-NEXT: ret half [[__EXP2]] ; %pow = tail call afn nnan ninf half @_Z3powDhDh(half %x, half %y) @@ -1264,9 +1264,9 @@ define <2 x half> @test_pow_afn_v2f16_nnan_ninf_x_known_positive(<2 x half> nofpclass(ninf nnorm nsub) %x, <2 x half> %y) { ; CHECK-LABEL: define <2 x half> @test_pow_afn_v2f16_nnan_ninf_x_known_positive ; CHECK-SAME: (<2 x half> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x half> [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @_Z4log2Dv2_Dh(<2 x half> [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @llvm.log2.v2f16(<2 x half> [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x half> [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @_Z4exp2Dv2_Dh(<2 x half> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @llvm.exp2.v2f16(<2 x half> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x half> [[__EXP2]] ; %pow = tail call afn nnan ninf <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> %y) @@ -1784,9 +1784,6 @@ define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) { ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison ; CHECK-SAME: (<2 x float> [[X:%.*]]) { -; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]]) -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> poison) ; CHECK-NEXT: ret <2 x float> poison ; %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> poison) @@ -2209,9 +2206,9 @@ ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32 ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32 @@ -2291,9 +2288,9 @@ ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i32 [[Y]] to float ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32 ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32 @@ -2339,9 +2336,9 @@ ; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i256 [[Y]] to float ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32 ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32 @@ -2361,9 +2358,9 @@ ; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i256 [[Y]] to float ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32 ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32 @@ -2383,9 +2380,9 @@ ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float> ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[__FABS]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32> ; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32> @@ -2429,9 +2426,9 @@ ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp <2 x i32> [[Y]] to <2 x float> ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[__FABS]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32> ; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32> @@ -2475,9 +2472,9 @@ ; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32__known_positive_x__known_integral_sitofp ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; %y.cast = sitofp i32 %y to float Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll @@ -672,10 +672,10 @@ ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] @@ -694,10 +694,10 @@ ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float> ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[POWNI2F]] to <2 x i32> ; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <2 x i32> @@ -763,10 +763,10 @@ ; CHECK-SAME: (half [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn half @llvm.fabs.f16(half [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn half @_Z4log2Dh(half [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn half @llvm.log2.f16(half [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to half ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn half [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn half @_Z4exp2Dh(half [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn half @llvm.exp2.f16(half [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = trunc i32 [[Y]] to i16 ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i16 [[__YTOU]], 15 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast half [[X]] to i16 @@ -786,10 +786,10 @@ ; CHECK-SAME: (<2 x half> [[X:%.*]], <2 x i32> [[Y:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x half> @llvm.fabs.v2f16(<2 x half> [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @_Z4log2Dv2_Dh(<2 x half> [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @llvm.log2.v2f16(<2 x half> [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp <2 x i32> [[Y]] to <2 x half> ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x half> [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @_Z4exp2Dv2_Dh(<2 x half> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @llvm.exp2.v2f16(<2 x half> [[__YLOGX]]) ; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x half> [[POWNI2F]] to <2 x i16> ; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i16> [[__YTOU]], ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x half> [[X]] to <2 x i16> @@ -821,10 +821,10 @@ ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] @@ -841,9 +841,6 @@ define float @test_pown_fast_f32__y_poison(float %x) { ; CHECK-LABEL: define float @test_pown_fast_f32__y_poison ; CHECK-SAME: (float [[X:%.*]]) { -; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]]) -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @_Z4exp2f(float poison) ; CHECK-NEXT: ret float poison ; %call = tail call fast float @_Z4pownfi(float %x, i32 poison) @@ -1070,10 +1067,10 @@ ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] @@ -1131,10 +1128,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Y:%.*]] = shl i32 [[Y_ARG]], 1 ; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; entry: @@ -1149,10 +1146,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Y:%.*]] = shl i32 [[Y_ARG]], 1 ; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) ; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; entry: Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll @@ -25,9 +25,9 @@ define float @test_powr_fast_f32(float %x, float %y) { ; CHECK-LABEL: define float @test_powr_fast_f32 ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; %powr = tail call fast float @_Z4powrff(float %x, float %y) @@ -37,9 +37,9 @@ define <2 x float> @test_powr_fast_v2f32(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: define <2 x float> @test_powr_fast_v2f32 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call fast <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call fast <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast <2 x float> [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call fast <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; %powr = tail call fast <2 x float> @_Z4powrDv2_fS_(<2 x float> %x, <2 x float> %y) @@ -449,7 +449,7 @@ define float @test_powr_afn_f32_noinline(float %x, float %y) { ; CHECK-LABEL: define float @test_powr_afn_f32_noinline ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[POWR:%.*]] = tail call afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[POWR:%.*]] = tail call afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: ret float [[POWR]] ; %powr = tail call afn float @_Z4powrff(float %x, float %y) #1 @@ -459,7 +459,7 @@ define float @test_powr_afn_f32_nnan_noinline(float %x, float %y) { ; CHECK-LABEL: define float @test_powr_afn_f32_nnan_noinline ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[POWR:%.*]] = tail call nnan afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR3]] +; CHECK-NEXT: [[POWR:%.*]] = tail call nnan afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR4]] ; CHECK-NEXT: ret float [[POWR]] ; %powr = tail call afn nnan float @_Z4powrff(float %x, float %y) #1 @@ -479,7 +479,7 @@ define float @test_powr_fast_f32_nobuiltin(float %x, float %y) { ; CHECK-LABEL: define float @test_powr_fast_f32_nobuiltin ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[POWR:%.*]] = tail call fast float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[POWR:%.*]] = tail call fast float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR5:[0-9]+]] ; CHECK-NEXT: ret float [[POWR]] ; %powr = tail call fast float @_Z4powrff(float %x, float %y) #3 @@ -1010,9 +1010,9 @@ define float @test_powr_afn_f32_nnan_ninf_x_known_positive(float nofpclass(ninf nnorm nsub) %x, float %y) { ; CHECK-LABEL: define float @test_powr_afn_f32_nnan_ninf_x_known_positive ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], float [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; %powr = tail call afn nnan ninf float @_Z4powrff(float %x, float %y) @@ -1032,9 +1032,9 @@ define <2 x float> @test_powr_afn_v2f32_nnan_ninf_x_known_positive(<2 x float> nofpclass(ninf nnorm nsub) %x, <2 x float> %y) { ; CHECK-LABEL: define <2 x float> @test_powr_afn_v2f32_nnan_ninf_x_known_positive ; CHECK-SAME: (<2 x float> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x float> [[Y:%.*]]) { -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; %powr = tail call afn nnan ninf <2 x float> @_Z4powrDv2_fS_(<2 x float> %x, <2 x float> %y) @@ -1109,9 +1109,9 @@ ; CHECK-LABEL: define float @test_powr_afn_nnan_ninf_f32_known_integral_sitofp ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; %y.cast = sitofp i32 %y to float @@ -1147,9 +1147,9 @@ ; CHECK-LABEL: define float @test_powr_afn_nnan_ninf_f32_known_integral_uitofp ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i32 [[Y]] to float -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; %y.cast = uitofp i32 %y to float @@ -1161,9 +1161,9 @@ ; CHECK-LABEL: define <2 x float> @test_powr_afn_nnan_ninf_v2f32_known_integral_sitofp ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float> -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; %y.cast = sitofp <2 x i32> %y to <2 x float> @@ -1199,9 +1199,9 @@ ; CHECK-LABEL: define <2 x float> @test_powr_afn_nnan_ninf_v2f32_known_integral_uitofp ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) { ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp <2 x i32> [[Y]] to <2 x float> -; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]]) +; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) ; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]] -; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]]) +; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; %y.cast = uitofp <2 x i32> %y to <2 x float> Index: llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -351,10 +351,10 @@ ; GCN-LABEL: {{^}}define half @test_pown_f16( ; GCN-NATIVE: %__fabs = tail call fast half @llvm.fabs.f16(half %x) -; GCN-NATIVE: %__log2 = tail call fast half @_Z4log2Dh(half %__fabs) +; GCN-NATIVE: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs) ; GCN-NATIVE: %pownI2F = sitofp i32 %y to half ; GCN-NATIVE: %__ylogx = fmul fast half %__log2, %pownI2F -; GCN-NATIVE: %__exp2 = tail call fast half @_Z4exp2Dh(half %__ylogx) +; GCN-NATIVE: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx) ; GCN-NATIVE: %__ytou = trunc i32 %y to i16 ; GCN-NATIVE: %__yeven = shl i16 %__ytou, 15 ; GCN-NATIVE: %0 = bitcast half %x to i16 @@ -371,16 +371,15 @@ declare float @_Z4pownfi(float, i32) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow -; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 1.013000e+03) -; GCN-PRELINK: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp) -; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs) -; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03 -; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx) -; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32 -; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648 -; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32 -; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]] -; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4 +; GCN: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp) +; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs) +; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03 +; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx) +; GCN: %[[r0:.*]] = bitcast float %tmp to i32 +; GCN: %__pow_sign = and i32 %[[r0]], -2147483648 +; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32 +; GCN: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]] +; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -390,15 +389,10 @@ } ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr -; GCN-POSTLINK: call fast float @_Z4powrff(float %tmp, float %tmp1) -; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp) -; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1 -; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx) -; GCN-PRELINK: store float %__exp2, ptr addrspace(1) %a, align 4 -; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp) -; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1 -; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx) -; GCN-NATIVE: store float %__exp2, ptr addrspace(1) %a, align 4 +; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %tmp) +; GCN: %__ylogx = fmul fast float %tmp1, %__log2 +; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx) +; GCN: store float %__exp2, ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_powr(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -410,19 +404,18 @@ } ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown -; GCN-POSTLINK: call fast float @_Z4pownfi(float %tmp, i32 %conv) -; GCN-PRELINK: %conv = fptosi float %tmp1 to i32 -; GCN-PRELINK: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp) -; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs) -; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float -; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F -; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx) -; GCN-PRELINK: %__yeven = shl i32 %conv, 31 -; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32 -; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]] -; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32 -; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]] -; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4 +; GCN: %conv = fptosi float %tmp1 to i32 +; GCN: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp) +; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs) +; GCN: %pownI2F = sitofp i32 %conv to float +; GCN: %__ylogx = fmul fast float %__log2, %pownI2F +; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx) +; GCN: %__yeven = shl i32 %conv, 31 +; GCN: %[[r0:.*]] = bitcast float %tmp to i32 +; GCN: %__pow_sign = and i32 %__yeven, %[[r0]] +; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32 +; GCN: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]] +; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -438,30 +431,30 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>) ; GCN-LABEL: define half @test_pow_fast_f16__y_13(half %x) -; GCN-PRELINK: %__fabs = tail call fast half @llvm.fabs.f16(half %x) -; GCN-PRELINK: %__log2 = tail call fast half @_Z4log2Dh(half %__fabs) -; GCN-PRELINK: %__ylogx = fmul fast half %__log2, 0xH4A80 -; GCN-PRELINK: %__exp2 = tail call fast half @_Z4exp2Dh(half %__ylogx) -; GCN-PRELINK: %1 = bitcast half %x to i16 -; GCN-PRELINK: %__pow_sign = and i16 %1, -32768 -; GCN-PRELINK: %2 = bitcast half %__exp2 to i16 -; GCN-PRELINK: %3 = or i16 %__pow_sign, %2 -; GCN-PRELINK: %4 = bitcast i16 %3 to half +; GCN: %__fabs = tail call fast half @llvm.fabs.f16(half %x) +; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs) +; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80 +; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx) +; GCN: %1 = bitcast half %x to i16 +; GCN: %__pow_sign = and i16 %1, -32768 +; GCN: %2 = bitcast half %__exp2 to i16 +; GCN: %3 = or i16 %__pow_sign, %2 +; GCN: %4 = bitcast i16 %3 to half define half @test_pow_fast_f16__y_13(half %x) { %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0) ret half %powr } ; GCN-LABEL: define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) -; GCN-PRELINK: %__fabs = tail call fast <2 x half> @llvm.fabs.v2f16(<2 x half> %x) -; GCN-PRELINK: %__log2 = tail call fast <2 x half> @_Z4log2Dv2_Dh(<2 x half> %__fabs) -; GCN-PRELINK: %__ylogx = fmul fast <2 x half> %__log2, -; GCN-PRELINK: %__exp2 = tail call fast <2 x half> @_Z4exp2Dv2_Dh(<2 x half> %__ylogx) -; GCN-PRELINK: %1 = bitcast <2 x half> %x to <2 x i16> -; GCN-PRELINK: %__pow_sign = and <2 x i16> %1, -; GCN-PRELINK: %2 = bitcast <2 x half> %__exp2 to <2 x i16> -; GCN-PRELINK: %3 = or <2 x i16> %__pow_sign, %2 -; GCN-PRELINK: %4 = bitcast <2 x i16> %3 to <2 x half> +; GCN: %__fabs = tail call fast <2 x half> @llvm.fabs.v2f16(<2 x half> %x) +; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs) +; GCN: %__ylogx = fmul fast <2 x half> %__log2, +; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx) +; GCN: %1 = bitcast <2 x half> %x to <2 x i16> +; GCN: %__pow_sign = and <2 x i16> %1, +; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16> +; GCN: %3 = or <2 x i16> %__pow_sign, %2 +; GCN: %4 = bitcast <2 x i16> %3 to <2 x half> define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) { %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> ) ret <2 x half> %powr @@ -673,11 +666,11 @@ declare float @_Z5log10f(float) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr -; GCN-NATIVE: %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 -; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp) -; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1 -; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx) -; GCN-NATIVE: store float %__exp2, ptr addrspace(1) %a, align 4 +; GCN: %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 +; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %tmp) +; GCN: %__ylogx = fmul fast float %tmp1, %__log2 +; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx) +; GCN: store float %__exp2, ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_use_native_powr(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4