Index: llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -567,7 +567,8 @@
   return true;
 }
 
-static bool isKnownIntegral(const Value *V) {
+static bool isKnownIntegral(const Value *V, const DataLayout &DL,
+                            FastMathFlags FMF) {
   if (isa(V))
     return true;
 }
@@ -587,6 +588,24 @@
     return true;
   }
 
+  const Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  switch (I->getOpcode()) {
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+    // TODO: Could check nofpclass(inf) on incoming argument
+    if (FMF.noInfs())
+      return true;
+
+    // Need to check int size cannot produce infinity, which computeKnownFPClass
+    // knows how to do already.
+    return isKnownNeverInfinity(I, DL);
+  default:
+    break;
+  }
+
   return false;
 }
 
@@ -1013,7 +1032,7 @@
   if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
     // We cannot handle corner cases for a general pow() function, give up
     // unless y is a constant integral value. Then proceed as if it were pown.
-    if (!isKnownIntegral(opr1))
+    if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
       return false;
   }
 
Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -57,12 +57,95 @@
 ; CHECK-LABEL: test_pow_fast_f16__integral_y:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: v_writelane_b32 v40, s16, 14
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: v_writelane_b32 v40, s40, 8
+; CHECK-NEXT: v_writelane_b32 v40, s41, 9
+; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_mov_b32_e32 v42, v0
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v1
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s42, 10
+; CHECK-NEXT: v_writelane_b32 v40, s43, 11
+; CHECK-NEXT: v_writelane_b32 v40, s44, 12
+; CHECK-NEXT: v_cvt_f16_f32_e32 v43, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v42
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: v_writelane_b32 v40, s45, 13
+; CHECK-NEXT: v_mov_b32_e32 v41, v31
+; CHECK-NEXT: s_mov_b32 s42, s15
+; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_mov_b32 s44, s13
+; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_mul_f16_e32 v0, v0, v43
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s45
+; CHECK-NEXT: s_mov_b32 s13, s44
+; CHECK-NEXT: s_mov_b32 s14, s43
+; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_cvt_i16_f16_e32 v1, v43
+; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1
+; CHECK-NEXT: v_and_b32_e32 v1, v1, v42
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_or_b32_e32 v0, v1, v0
+; CHECK-NEXT: v_readlane_b32 s45, v40, 13
+; CHECK-NEXT: v_readlane_b32 s44, v40, 12
+; CHECK-NEXT: v_readlane_b32 s43, v40, 11
+; CHECK-NEXT: v_readlane_b32 s42, v40, 10
+; CHECK-NEXT: v_readlane_b32 s41, v40, 9
+; CHECK-NEXT: v_readlane_b32 s40, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 14
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xf800
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to half
   %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
   ret half %pow
@@ -72,11 +155,93 @@
 ; CHECK-LABEL: test_pow_fast_f32__integral_y:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3powff@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: v_writelane_b32 v40, s16, 14
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: v_writelane_b32 v40, s40, 8
+; CHECK-NEXT: v_writelane_b32 v40, s41, 9
+; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s42, 10
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_writelane_b32 v40, s43, 11
+; CHECK-NEXT: v_mov_b32_e32 v42, v0
+; CHECK-NEXT: v_writelane_b32 v40, s44, 12
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v42
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: v_writelane_b32 v40, s45, 13
+; CHECK-NEXT: v_mov_b32_e32 v41, v31
+; CHECK-NEXT: s_mov_b32 s42, s15
+; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_mov_b32 s44, s13
+; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: v_cvt_f32_i32_e32 v43, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_mul_f32_e32 v0, v0, v43
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s45
+; CHECK-NEXT: s_mov_b32 s13, s44
+; CHECK-NEXT: s_mov_b32 s14, s43
+; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v43
+; CHECK-NEXT: v_readlane_b32 s45, v40, 13
+; CHECK-NEXT: v_readlane_b32 s44, v40, 12
+; CHECK-NEXT: v_readlane_b32 s43, v40, 11
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; CHECK-NEXT: v_and_or_b32 v0, v1, v42, v0
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s42, v40, 10
+; CHECK-NEXT: v_readlane_b32 s41, v40, 9
+; CHECK-NEXT: v_readlane_b32 s40, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 14
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xf800
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to float
   %pow = tail call fast float @_Z3powff(float %x, float %y)
   ret float %pow
@@ -86,11 +251,98 @@
 ; CHECK-LABEL: test_pow_fast_f64__integral_y:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v2
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: v_writelane_b32 v40, s16, 14
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: v_writelane_b32 v40, s40, 8
+; CHECK-NEXT: v_writelane_b32 v40, s41, 9
+; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s42, 10
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_writelane_b32 v40, s43, 11
+; CHECK-NEXT: v_mov_b32_e32 v43, v1
+; CHECK-NEXT: v_writelane_b32 v40, s44, 12
+; CHECK-NEXT: v_mov_b32_e32 v42, v2
+; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: v_writelane_b32 v40, s45, 13
+; CHECK-NEXT: v_mov_b32_e32 v41, v31
+; CHECK-NEXT: s_mov_b32 s42, s15
+; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_mov_b32 s44, s13
+; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: v_cvt_f64_i32_e32 v[44:45], v42
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[44:45]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s45
+; CHECK-NEXT: s_mov_b32 s13, s44
+; CHECK-NEXT: s_mov_b32 s14, s43
+; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42
+; CHECK-NEXT: v_and_b32_e32 v2, v2, v43
+; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
+; CHECK-NEXT: v_readlane_b32 s45, v40, 13
+; CHECK-NEXT: v_readlane_b32 s44, v40, 12
+; CHECK-NEXT: v_readlane_b32 s43, v40, 11
+; CHECK-NEXT: v_readlane_b32 s42, v40, 10
+; CHECK-NEXT: v_readlane_b32 s41, v40, 9
+; CHECK-NEXT: v_readlane_b32 s40, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 14
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xf800
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to double
   %pow = tail call fast double @_Z3powdd(double %x, double %y)
   ret double %pow
Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -2208,8 +2208,18 @@
 ; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
 ;
   %y.cast = sitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2280,8 +2290,18 @@
 ; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i32 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
 ;
   %y.cast = uitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2318,8 +2338,18 @@
 ; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256
 ; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
 ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i256 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
 ;
   %y.cast = uitofp i256 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2330,8 +2360,18 @@
 ; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256
 ; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
 ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i256 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
 ;
   %y.cast = sitofp i256 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2342,8 +2382,18 @@
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float>
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn <2 x float> @_Z3powDv2_fS_(<2 x float> [[X]], <2 x float> [[Y_CAST]])
-; CHECK-NEXT: ret <2 x float> [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT: ret <2 x float> [[TMP4]]
 ;
   %y.cast = sitofp <2 x i32> %y to <2 x float>
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2378,8 +2428,18 @@
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp <2 x i32> [[Y]] to <2 x float>
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn <2 x float> @_Z3powDv2_fS_(<2 x float> [[X]], <2 x float> [[Y_CAST]])
-; CHECK-NEXT: ret <2 x float> [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT: ret <2 x float> [[TMP4]]
 ;
   %y.cast = uitofp <2 x i32> %y to <2 x float>
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)