Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -587,6 +587,7 @@
 }
 
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  addPass(createLICMPass());
   addPass(createSeparateConstOffsetFromGEPPass());
   addPass(createSpeculativeExecutionPass());
   // ReassociateGEPs exposes more opportunites for SLSR. See
Index: test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
===================================================================
--- test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
+++ test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
@@ -24,7 +24,7 @@
 
 .endls:                                           ; preds = %.beginls, %.entry
   %.fca.2.gep120.i = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>] addrspace(5)* %__llpc_global_proxy_7.i, i64 0, i64 2
-  store <4 x float> , <4 x float> addrspace(5)* %.fca.2.gep120.i, align 16
+  store volatile <4 x float> , <4 x float> addrspace(5)* %.fca.2.gep120.i, align 16
   br label %bb
 
 bb:                                               ; preds = %bb, %.endls
Index: test/CodeGen/AMDGPU/collapse-endcf.ll
===================================================================
--- test/CodeGen/AMDGPU/collapse-endcf.ll
+++ test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -210,7 +210,7 @@
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
 
 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
 
-; GCN: s_and_b64 exec, exec, vcc
+; GCN: s_and_b64 exec, exec, {{vcc|s\[[0-9:]+\]}}
 ; GCN-NOT: s_or_b64 exec, exec
 
Index: test/CodeGen/AMDGPU/i1-copy-from-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -7,11 +7,13 @@
 ; only contain the lanes that were active during the last loop iteration.
 ;
 ; SI: ; %for.body
-; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NOT: [[VREG]]
-; SI: ; %for.end
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
+; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
+; SI: [[ENDIF]]:
+; SI-NOT: [[VREG]]
+; SI: ; %for.end
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 entry:
   br label %for.body
Index: test/CodeGen/AMDGPU/idiv-licm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/idiv-licm.ll
@@ -0,0 +1,249 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}udiv32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = udiv i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}urem32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = urem i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}sdiv32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = sdiv i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}srem32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = srem i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}udiv16_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = udiv i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}urem16_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = urem i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}sdiv16_invariant_denom:
+; GCN-DAG: s_sext_i32_i16
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
+; GCN-DAG: v_cvt_f32_i32
+; GCN-DAG: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = sdiv i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}srem16_invariant_denom:
+; GCN-DAG: s_sext_i32_i16
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
+; GCN-DAG: v_cvt_f32_i32
+; GCN-DAG: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2:                                              ; preds = %bb3
+  ret void
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = srem i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
Index: test/CodeGen/AMDGPU/infinite-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/infinite-loop.ll
+++ test/CodeGen/AMDGPU/infinite-loop.ll
@@ -12,7 +12,7 @@
   br label %loop
 
 loop:
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   br label %loop
 }
 
@@ -21,7 +21,7 @@
 ; IR: br i1 %cond, label %loop, label %UnifiedReturnBlock
 
 ; IR: loop:
-; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
 ; IR: br i1 true, label %loop, label %UnifiedReturnBlock
 
 ; IR: UnifiedReturnBlock:
@@ -47,7 +47,7 @@
   br i1 %cond, label %loop, label %return
 
 loop:
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   br label %loop
 
 return:
@@ -59,11 +59,11 @@
 ; IR: br i1 undef, label %loop1, label %loop2
 
 ; IR: loop1:
-; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
 ; IR: br i1 true, label %loop1, label %DummyReturnBlock
 
 ; IR: loop2:
-; IR: store i32 888, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 888, i32 addrspace(1)* %out, align 4
 ; IR: br i1 true, label %loop2, label %DummyReturnBlock
 
 ; IR: DummyReturnBlock:
@@ -96,11 +96,11 @@
   br i1 undef, label %loop1, label %loop2
 
 loop1:
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   br label %loop1
 
 loop2:
-  store i32 888, i32 addrspace(1)* %out, align 4
+  store volatile i32 888, i32 addrspace(1)* %out, align 4
   br label %loop2
 }
 
@@ -113,7 +113,7 @@
 ; IR: br label %inner_loop
 
 ; IR: inner_loop:
-; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
 ; IR: %cond3 = icmp eq i32 %tmp, 3
 ; IR: br i1 true, label %TransitionBlock, label %UnifiedReturnBlock
 
@@ -132,7 +132,6 @@
 ; SI: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %inner_loop
 ; SI: s_waitcnt expcnt(0)
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
-; SI: v_cmp_ne_u32_e32
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: buffer_store_dword [[REG]]
 
@@ -156,7 +155,7 @@
   br label %inner_loop
 
 inner_loop:                                       ; preds = %LeafBlock, %LeafBlock1
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   %cond3 = icmp eq i32 %tmp, 3
   br i1 %cond3, label %inner_loop, label %outer_loop
 
Index: test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- test/CodeGen/AMDGPU/multilevel-break.ll
+++ test/CodeGen/AMDGPU/multilevel-break.ll
@@ -36,7 +36,7 @@
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
 
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow1{{$}}
+; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
 ; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
 
 ; Ensure copy is eliminated
Index: test/CodeGen/AMDGPU/si-annotate-cf.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
 
@@ -89,17 +89,24 @@
 
 ; This broke the old AMDIL cfg structurizer
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
-; SI: s_cmp_lt_i32
-; SI-NEXT: s_cbranch_scc1 [[ENDPGM:BB[0-9]+_[0-9]+]]
-
-; SI: s_cmpk_lt_i32
-; SI-NEXT: s_cbranch_scc0 [[ENDPGM]]
-
-; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
-; SI: s_cbranch_vccnz [[INFLOOP]]
-
-; SI: [[ENDPGM]]:
-; SI: s_endpgm
+; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
+; SI: s_and_b64 vcc, exec, [[CMP4]]
+; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
+; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
+; SI-NEXT: BB{{[0-9_]+}}:
+; SI-NEXT: buffer_store_dword
+
+; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:
+
+; SI: [[BR1]]:
+; SI-NEXT: s_and_b64 vcc, exec,
+; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI: s_branch [[INFLOOP]]
+; SI-NEXT: [[BR2]]:
+; SI: s_cbranch_vccz [[ENDPGM]]
+
+; SI: [[ENDPGM]]:
+; SI-NEXT: s_endpgm
 define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
 entry:
   %cmp = icmp sgt i32 %c0, 0
@@ -144,7 +151,6 @@
   ret void
 }
 
-
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
 
 attributes #0 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -328,6 +328,7 @@
 
 .inner_loop_body:
   %descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0
   %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0)
+  store float %load1result, float addrspace(1)* undef
   %inner_br2 = icmp uge i32 %1, 10
   br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body