Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3337,7 +3337,10 @@ case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; - case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; + case AMDGPU::S_MUL_I32: + // TODO-GFX10: The docs say that V_MUL_LO_I32 exists but is deprecated, and + // the emulator does not recognize it. + return ST.getGeneration() >= AMDGPUSubtarget::GFX10 ? AMDGPU::V_MUL_LO_U32 : AMDGPU::V_MUL_LO_I32; case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32; case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; Index: test/CodeGen/AMDGPU/mul.ll =================================================================== --- test/CodeGen/AMDGPU/mul.ll +++ test/CodeGen/AMDGPU/mul.ll @@ -1,6 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,PRE-GFX10,LO-I32,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,VIPLUS,PRE-GFX10,LO-I32,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn 
-mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VIPLUS,GFX10,LO-U32,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=EG,FUNC %s ; mul24 and mad24 are affected @@ -9,8 +10,11 @@ ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-I32: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-I32: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; LO-U32: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-U32: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -27,10 +31,15 @@ ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-I32: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-I32: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-I32: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-I32: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; LO-U32: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-U32: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-U32: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; LO-U32: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 @@ -56,7 +65,8 @@ ; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: ; GCN: s_load_dword ; GCN: s_load_dword -; GCN: v_mul_lo_i32 +; LO-I32: v_mul_lo_i32 +; LO-U32: v_mul_lo_u32 ; GCN: buffer_store_dword define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 8 @@ -72,8 +82,10 @@ ; FUNC-LABEL: {{^}}mul64_sext_c: ; EG-DAG: MULLO_INT ; EG-DAG: MULHI_INT -; GCN-DAG: s_mul_i32 -; GCN-DAG: v_mul_hi_i32 +; PRE-GFX10-DAG: s_mul_i32 +; PRE-GFX10-DAG: v_mul_hi_i32 +; GFX10-DAG: s_mul_i32 +; GFX10-DAG: s_mul_hi_i32 define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { entry: %0 = sext i32 %in to i64 @@ -85,7 +97,8 @@ ; FUNC-LABEL: {{^}}v_mul64_sext_c: ; EG-DAG: MULLO_INT ; EG-DAG: MULHI_INT -; GCN-DAG: v_mul_lo_i32 +; LO-I32-DAG: v_mul_lo_i32 +; LO-U32-DAG: v_mul_lo_u32 ; GCN-DAG: v_mul_hi_i32 ; GCN: s_endpgm define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -97,7 +110,8 @@ } ; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: -; GCN-DAG: v_mul_lo_i32 
v{{[0-9]+}}, v{{[0-9]+}}, 9 +; LO-I32-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 +; LO-U32-DAG: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; GCN-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; GCN: s_endpgm define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -122,7 +136,8 @@ } ; FUNC-LABEL: {{^}}v_mul_i32: -; GCN: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; LO-I32: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; LO-U32: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in @@ -145,6 +160,11 @@ ; GFX9-DAG: s_mul_i32 ; GFX9-DAG: s_mul_i32 ; GFX9: s_endpgm +; GFX10-DAG: s_mul_i32 +; GFX10-DAG: s_mul_hi_u32 +; GFX10-DAG: s_mul_i32 +; GFX10-DAG: s_mul_i32 +; GFX10: s_endpgm define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %mul = mul i64 %a, %b store i64 %mul, i64 addrspace(1)* %out, align 8 @@ -152,7 +172,8 @@ } ; FUNC-LABEL: {{^}}v_mul_i64: -; GCN: v_mul_lo_i32 +; LO-I32: v_mul_lo_i32 +; LO-U32: v_mul_lo_u32 define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 @@ -183,8 +204,10 @@ } ; FUNC-LABEL: {{^}}mul64_in_branch: -; GCN-DAG: s_mul_i32 -; GCN-DAG: v_mul_hi_u32 +; PRE-GFX10-DAG: s_mul_i32 +; PRE-GFX10-DAG: v_mul_hi_u32 +; GFX10-DAG: s_mul_i32 +; GFX10-DAG: s_mul_hi_u32 ; GCN: s_endpgm define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: @@ -238,6 +261,15 @@ ; VI: v_mad_u64_u32 ; VI: s_mul_i32 +; GFX10: s_mul_hi_u32 +; GFX10: s_mul_i32 +; GFX10: s_mul_hi_u32 +; GFX10: s_mul_i32 +; GFX10-DAG: v_mad_u64_u32 +; GFX10-DAG: s_mul_hi_u32 +; GFX10-DAG: v_mad_u64_u32 +; 
GFX10: s_mul_i32 +; GFX10: s_mul_i32 ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { @@ -247,8 +279,8 @@ } ; FUNC-LABEL: {{^}}v_mul_i128: -; GCN: {{buffer|flat}}_load_dwordx4 -; GCN: {{buffer|flat}}_load_dwordx4 +; GCN: {{buffer|flat|global}}_load_dwordx4 +; GCN: {{buffer|flat|global}}_load_dwordx4 ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_u32 @@ -270,12 +302,12 @@ ; SI-DAG: v_mul_lo_i32 ; VI-DAG: v_mul_lo_i32 -; VI-DAG: v_mul_hi_u32 -; VI: v_mad_u64_u32 -; VI: v_mad_u64_u32 -; VI: v_mad_u64_u32 +; GFX10-DAG: v_mul_lo_u32 +; VIPLUS-DAG: v_mul_hi_u32 +; VIPLUS: v_mad_u64_u32 +; VIPLUS: v_mad_u64_u32 -; GCN: {{buffer|flat}}_store_dwordx4 +; GCN: {{buffer|flat|global}}_store_dwordx4 define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid