Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -0,0 +1,121 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
+
+; GCN-LABEL: {{^}}test_loop:
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+; GCN: s_endpgm
+define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  %cmp = icmp eq i32 %n, -1
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br label %for.body
+}
+
+; GCN-LABEL: @loop_const_true
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 true, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_false:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+; XXX - Should there be an S_ENDPGM?
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 false, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_undef:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+; XXX - Should there be an s_endpgm?
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 undef, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_arg_0:
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: v_cmp_eq_i32_e32 vcc, 1,
+
+; GCN: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, vcc
+; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccnz [[LOOPBB]]
+; GCN-NEXT: ; BB#2
+; GCN-NEXT: s_endpgm
+define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 %cond, label %for.body, label %for.exit
+}
Index: test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -0,0 +1,126 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+                                                    <4 x i32> addrspace(1)* noalias %out1,
+                                                    i32 addrspace(1)* noalias %out2,
+                                                    i32 addrspace(1)* %in) {
+  %elt0 = load volatile i32, i32 addrspace(1)* %in
+  %elt1 = load volatile i32, i32 addrspace(1)* %in
+  %elt2 = load volatile i32, i32 addrspace(1)* %in
+  %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out1
+
+  %extract0 = extractelement <4 x i32> %vec3, i32 0
+  %extract1 = extractelement <4 x i32> %vec3, i32 1
+  %extract2 = extractelement <4 x i32> %vec3, i32 2
+  %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+  store volatile i32 %extract0, i32 addrspace(1)* %out2
+  store volatile i32 %extract1, i32 addrspace(1)* %out2
+  store volatile i32 %extract2, i32 addrspace(1)* %out2
+  store volatile i32 %extract3, i32 addrspace(1)* %out2
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_extract_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+                                                            <4 x i32> addrspace(1)* noalias %out1,
+                                                            i32 addrspace(1)* noalias %out2,
+                                                            i32 addrspace(1)* %in) {
+  %elt0 = load volatile i32, i32 addrspace(1)* %in
+  %elt1 = load volatile i32, i32 addrspace(1)* %in
+  %elt2 = load volatile i32, i32 addrspace(1)* %in
+  %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+  %extract0 = extractelement <4 x i32> %vec3, i32 0
+  %extract1 = extractelement <4 x i32> %vec3, i32 1
+  %extract2 = extractelement <4 x i32> %vec3, i32 2
+  %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+  %op0 = add i32 %extract0, 3
+  %op1 = sub i32 %extract1, 9
+  %op2 = xor i32 %extract2, 1231412
+  %op3 = and i32 %extract3, 258233412312
+
+  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+
+  store volatile i32 %op0, i32 addrspace(1)* %out2
+  store volatile i32 %op1, i32 addrspace(1)* %out2
+  store volatile i32 %op2, i32 addrspace(1)* %out2
+  store volatile i32 %op3, i32 addrspace(1)* %out2
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+                                                                     <4 x i32> addrspace(1)* noalias %out1,
+                                                                     i64 addrspace(1)* noalias %out2,
+                                                                     i32 addrspace(1)* %in) {
+  %elt0 = load volatile i32, i32 addrspace(1)* %in
+  %elt1 = load volatile i32, i32 addrspace(1)* %in
+  %elt2 = load volatile i32, i32 addrspace(1)* %in
+  %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+  %bc.vec3 = bitcast <4 x i32> %vec3 to <2 x i64>
+  store <2 x i64> %bc.vec3, <2 x i64> addrspace(1)* %out0
+
+  %extract0 = extractelement <2 x i64> %bc.vec3, i32 0
+  %extract1 = extractelement <2 x i64> %bc.vec3, i32 1
+
+  store volatile i64 %extract0, i64 addrspace(1)* %out2
+  store volatile i64 %extract1, i64 addrspace(1)* %out2
+
+  ret void
+}
Index: test/CodeGen/AMDGPU/mad24-get-global-id.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; If the workgroup id range is restricted, we should be able to use
+; mad24 for the usual indexing pattern.
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+; GCN-LABEL: {{^}}get_global_id_0:
+; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
+; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
+; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
+define void @get_global_id_0(i32 addrspace(1)* %out) #1 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
+  %workgroup.size.xy = load i32, i32 addrspace(2)* %gep, align 4, !invariant.load !0
+  %workgroup.size.x = and i32 %workgroup.size.xy, 65535
+
+  %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+  %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x(), !range !2
+
+  %mul = mul i32 %workgroup.id.x, %workgroup.size.x
+  %add = add i32 %mul, %workitem.id.x
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!0 = !{}
+!1 = !{i32 0, i32 1024}
+!2 = !{i32 0, i32 16777216}
Index: test/CodeGen/AMDGPU/no-shrink-extloads.ll
===================================================================
--- test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -201,3 +201,15 @@
   store i32 %mask, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %bc = bitcast <2 x i32> %ld to i64
+  %hi = lshr i64 %bc, 32
+  %trunc = trunc i64 %hi to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-
-
-define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
-; CHECK-LABEL: {{^}}test:
-
-entry:
-  switch i32 %x, label %sw.default [
-    i32 0, label %sw.bb
-    i32 60, label %sw.bb
-  ]
-
-sw.bb:
-  unreachable
-
-sw.default:
-  unreachable
-
-sw.epilog:
-  ret void
-}
-
Index: test/CodeGen/AMDGPU/si-annotate-cf.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -64,6 +64,87 @@
   ret void
 }
 
+; FIXME: should emit s_endpgm
+; CHECK-LABEL: {{^}}switch_unreachable:
+; CHECK-NOT: s_endpgm
+; CHECK: .Lfunc_end2
+define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+centry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 60, label %sw.bb
+  ]
+
+sw.bb:
+  unreachable
+
+sw.default:
+  unreachable
+
+sw.epilog:
+  ret void
+}
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; This broke the old AMDIL cfg structurizer
+; FUNC-LABEL: {{^}}loop_land_info_assert:
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc0 [[ENDPGM:BB[0-9]+_[0-9]+]]
+
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc1 [[ENDPGM]]
+
+; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
+; SI: s_branch [[INFLOOP]]
+
+; SI: [[ENDPGM]]:
+; SI: s_endpgm
+define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
+entry:
+  %cmp = icmp sgt i32 %c0, 0
+  br label %while.cond.outer
+
+while.cond.outer:
+  %tmp = load float, float addrspace(1)* undef
+  br label %while.cond
+
+while.cond:
+  %cmp1 = icmp slt i32 %c1, 4
+  br i1 %cmp1, label %convex.exit, label %for.cond
+
+convex.exit:
+  %or = or i1 %cmp, %cmp1
+  br i1 %or, label %return, label %if.end
+
+if.end:
+  %tmp3 = call float @llvm.fabs.f32(float %tmp) nounwind readnone
+  %cmp2 = fcmp olt float %tmp3, 0x3E80000000000000
+  br i1 %cmp2, label %if.else, label %while.cond.outer
+
+if.else:
+  store volatile i32 3, i32 addrspace(1)* undef, align 4
+  br label %while.cond
+
+for.cond:
+  %cmp3 = icmp slt i32 %c3, 1000
+  br i1 %cmp3, label %for.body, label %return
+
+for.body:
+  br i1 %cmp3, label %self.loop, label %if.end.2
+
+if.end.2:
+  %or.cond2 = or i1 %cmp3, %arg
+  br i1 %or.cond2, label %return, label %for.cond
+
+self.loop:
+  br label %self.loop
+
+return:
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
 
 attributes #0 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test:
 ; CHECK s_and_saveexec_b64
@@ -6,7 +6,7 @@
 ; CHECK s_or_b64 exec, exec
 ; CHECK s_andn2_b64 exec, exec
 ; CHECK s_cbranch_execnz
-define spir_kernel void @test(i32 %arg, i32 %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6) {
+define void @test(i32 %arg, i32 %arg1) {
 bb:
   %tmp = icmp ne i32 %arg, 0
   %tmp7 = icmp ne i32 %arg1, 0
Index: test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
===================================================================
--- test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -1,8 +1,6 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
 ; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+; XUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
 
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
 
@@ -41,8 +39,9 @@
 }
 
 ; COMMON-LABEL: {{^}}branch_false:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
 define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
@@ -77,8 +76,9 @@
 }
 
 ; COMMON-LABEL: {{^}}branch_undef:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
 define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph