diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3377,6 +3377,15 @@ if (!Subtarget->hasMulI24() || VT.isVector()) return SDValue(); + // Don't generate 24-bit multiplies on values that are in SGPRs, since + // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs + // unnecessarily). isDivergent() is used as an approximation of whether the + // value is in an SGPR. + // This doesn't apply if no s_mul_hi is available (since we'll end up with a + // valu op anyway) + if (Subtarget->hasSMulHi() && !N->isDivergent()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); @@ -3401,6 +3410,15 @@ if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) return SDValue(); + // Don't generate 24-bit multiplies on values that are in SGPRs, since + // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs + // unnecessarily). isDivergent() is used as an approximation of whether the + // value is in an SGPR. + // This doesn't apply if no s_mul_hi is available (since we'll end up with a + // valu op anyway) + if (Subtarget->hasSMulHi() && !N->isDivergent()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -54,6 +54,7 @@ bool HasVOP3PInsts; bool HasMulI24; bool HasMulU24; + bool HasSMulHi; bool HasInv2PiInlineImm; bool HasFminFmaxLegacy; bool EnablePromoteAlloca; @@ -161,6 +162,10 @@ return HasMulU24; } + bool hasSMulHi() const { + return HasSMulHi; + } + bool hasInv2PiInlineImm() const { return HasInv2PiInlineImm; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -163,6 +163,7 @@ WavefrontSizeLog2 = 5; HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; TargetID.setTargetIDFromFeaturesString(FS); @@ -185,6 +186,7 @@ HasVOP3PInsts(false), HasMulI24(true), HasMulU24(true), + HasSMulHi(false), HasInv2PiInlineImm(false), HasFminFmaxLegacy(true), EnablePromoteAlloca(false), diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC,SIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC,SIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC,GFX9 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s @@ -22,10 +23,12 @@ } ; FUNC-LABEL: {{^}}test_smulhi24_i64: -; GCN-NOT: bfe +; SIVI-NOT: bfe ; GCN-NOT: ashr -; GCN: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]], -; GCN-NEXT: buffer_store_dword [[RESULT]] +; SIVI: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]], +; GFX9: s_mul_hi_i32 [[RES1:s[0-9]+]], +; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[RES1]] +; GCN: buffer_store_dword [[RESULT]] ; EG: ASHR ; EG: ASHR @@ -62,8 +65,10 @@ ; GCN-NOT: ashr -; GCN-DAG: v_mul_hi_i32_i24_e32 -; GCN-DAG: s_mul_i32 +; SIVI-DAG: v_mul_hi_i32_i24_e32 +; SIVI-DAG: s_mul_i32 +; GFX9-DAG: s_mul_hi_i32 +; GFX9-DAG: s_mul_i32 ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { @@ -80,8 +85,11 @@ ; FUNC-LABEL: {{^}}test_smul24_i64_square: ; GCN: s_load_dword [[A:s[0-9]+]] -; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] -; GCN-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]] +; SIVI-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] +; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]] +; GFX9: s_bfe_i32 [[B:s[0-9]+]], [[A]] +; GFX9-DAG: s_mul_hi_i32 s{{[0-9]+}}, [[B]], [[B]] +; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]] ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 @@ -99,14 +107,19 @@ ; GCN-NOT: and ; GCN-NOT: lshr -; GCN-DAG: s_mul_i32 -; GCN-DAG: v_mul_hi_i32_i24_e32 +; SIVI-DAG: s_mul_i32 +; SIVI-DAG: v_mul_hi_i32_i24_e32 ; SI: v_lshl_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31 ; SI: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31 ; VI: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}} ; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}} +; GFX9-DAG: s_mul_i32 +; GFX9-DAG: s_mul_hi_i32 +; GFX9: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31 +; GFX9: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31 + ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { entry: @@ -129,6 +142,11 @@ ; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]], ; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; SI-NEXT: buffer_store_dword v[[HI]] + +; GFX9: s_mul_hi_i32 s[[MUL_HI:[0-9]+]], +; GFX9-NEXT: s_and_b32 s[[HI:[0-9]+]], s[[MUL_HI]], 1 +; GFX9-NEXT: v_mov_b32_e32 v[[RES:[0-9]+]], s[[HI]] +; GFX9-NEXT: buffer_store_dword v[[RES]] define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9