Index: lib/Target/AMDGPU/VOP2Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP2Instructions.td
+++ lib/Target/AMDGPU/VOP2Instructions.td
@@ -524,12 +524,12 @@
 def : DivergentBinOp;
 
 let SubtargetPredicate = HasAddNoCarryInsts in {
-  def : DivergentBinOp;
+  def : DivergentClampingBinOp;
   def : DivergentClampingBinOp;
 }
 
 let SubtargetPredicate = isGFX6GFX7GFX8GFX9,
     Predicates = [isGFX6GFX7GFX8GFX9] in {
-def : DivergentBinOp;
+def : DivergentClampingBinOp;
 def : DivergentClampingBinOp;
 def : DivergentBinOp;
Index: test/CodeGen/AMDGPU/add.ll
===================================================================
--- test/CodeGen/AMDGPU/add.ll
+++ test/CodeGen/AMDGPU/add.ll
@@ -1,11 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}s_add_i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
 ; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]]
 ; GCN: buffer_store_dword v[[V_REG]],
@@ -19,9 +16,6 @@
 }
 
 ; FUNC-LABEL: {{^}}s_add_v2i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
@@ -34,11 +28,6 @@
 }
 
 ; FUNC-LABEL: {{^}}s_add_v4i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
@@ -53,15 +42,6 @@
 }
 ; FUNC-LABEL: {{^}}s_add_v8i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ -78,23 +58,6 @@
 }
 ; FUNC-LABEL: {{^}}s_add_v16i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ -124,7 +87,7 @@
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ -139,7 +102,7 @@
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]]
 define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ -151,13 +114,6 @@
 ; FUNC-LABEL: {{^}}add64:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %add = add i64 %a, %b
@@ -172,13 +128,6 @@
 
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; GCN-NOT: v_addc_u32_e32 s
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64, i64 addrspace(1)* %in
@@ -191,13 +140,6 @@
 ; FUNC-LABEL: {{^}}add64_in_branch:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
@@ -217,7 +159,26 @@
   ret void
 }
 
-declare i32 @llvm.r600.read.tidig.x() #1
+; Make sure the VOP3 form of add is initially selected. Otherwise a pair
+; of copies from/to VCC would be necessary.
+
+; GCN-LABEL: {{^}}add_select_vop3:
+; SI: v_add_i32_e64 v0, s[0:1], s0, v0
+; VI: v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX9: v_add_u32_e32 v0, s0, v0
+
+; GCN: ; def vcc
+; GCN: ds_write_b32
+; GCN: ; use vcc
+define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
+  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
+  %sub = add i32 %v, %s
+  store i32 %sub, i32 addrspace(3)* undef
+  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
===================================================================
--- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -280,6 +280,7 @@
 ; SI-NEXT: s_mov_b32 s10, s2
 ; SI-NEXT: s_mov_b32 s11, s3
 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_movk_i32 s13, 0x900
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
 ; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v1
@@ -297,7 +298,7 @@
 ; SI-NEXT: v_or_b32_e32 v0, v6, v7
 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
 ; SI-NEXT: v_and_b32_e32 v1, s12, v4
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s13, v0
 ; SI-NEXT: v_or_b32_e32 v1, v5, v1
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
Index: test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -7,6 +7,8 @@
 
 ; Function Attrs: nounwind
 ; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
+; SI: s_movk_i32 [[K_0X88:s[0-9]+]], 0x88
+; SI: s_movk_i32 [[K_0X100:s[0-9]+]], 0x100
 ; CHECK: BB0_1:
 ; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
@@ -14,9 +16,9 @@
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
 ; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, [[K_0X88]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, [[K_0X100]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
 
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2
Index: test/CodeGen/AMDGPU/fence-barrier.ll
===================================================================
--- test/CodeGen/AMDGPU/fence-barrier.ll
+++ test/CodeGen/AMDGPU/fence-barrier.ll
@@ -54,7 +54,8 @@
 }
 
 ; GCN-LABEL: {{^}}test_global
-; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, 0x888, v{{[0-9]+}}
+; GCN: s_movk_i32 [[K:s[0-9]+]], 0x888
+; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
 ; GCN: flat_store_dword
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
Index: test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -15,7 +15,8 @@
 }
 
 ; VI-LABEL: {{^}}dpp_test1:
-; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: s_nop 0
Index: test/CodeGen/AMDGPU/r600.add.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/r600.add.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}s_add_i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v2i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+  %result = add <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v4i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+  %result = add <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v8i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+entry:
+  %0 = add <8 x i32> %a, %b
+  store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v16i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+entry:
+  %0 = add <16 x i32> %a, %b
+  store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_i32:
+define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %b = load volatile i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_imm_i32:
+define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %result = add i32 %a, 123
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}add64:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %add = add i64 %a, %b
+  store i64 %add, i64 addrspace(1)* %out
+  ret void
+}
+
+; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
+; use VCC. The test is designed so that %a will be stored in an SGPR and
+; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
+; to a VGPR before doing the add.
+
+; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+entry:
+  %0 = load i64, i64 addrspace(1)* %in
+  %1 = add i64 %a, %0
+  store i64 %1, i64 addrspace(1)* %out
+  ret void
+}
+
+; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -458,7 +458,7 @@
 }
 
 ; GCN-LABEL: {{^}}phi_visit_order:
-; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
+; GCN: v_add_i32_e64 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 1, v{{[0-9]+}}
 define amdgpu_kernel void @phi_visit_order() {
 bb:
   br label %bb1