diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -168,11 +168,11 @@ ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: %tmp = load i32, i32 addrspace(4)* @external_constant - %ptr = load float*, float* addrspace(4)* @const.ptr %tmp1 = icmp ne i32 %tmp, 0 br i1 %tmp1, label %bb12, label %bb2 bb2: + %ptr = load float*, float* addrspace(4)* @const.ptr %tmp4 = load float, float* %ptr, align 4 %tmp5 = fcmp olt float %tmp4, 1.0 %tmp6 = or i1 %tmp5, false diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1536,7 +1536,6 @@ ; GFX11_W64-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 @@ -1555,6 +1554,7 @@ exit: %cond = phi i1 [false, %entry], [%cmp1, %bb] + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) store float %result, float addrspace(1)* %gep.out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -793,7 +793,6 @@ bb9: ; preds = %bb12, %bb %i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ] - %i11 = icmp slt i64 %i10, 0 br i1 undef, label %bb14, label %bb12 bb12: ; preds = %bb58, %bb9 @@ -801,6 +800,7 @@ br label %bb9 bb14: ; preds = %bb9 + %i11 = icmp slt i64 %i10, 0 %i15 = load i64, i64 addrspace(1)* null, align 8 br label %bb16 @@ -825,23 +825,23 @@ %i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14 %i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)* %i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4 + %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8 + %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32 + fence syncscope("workgroup") acquire + br i1 %i11, label %bb58, label %bb51 + +bb51: ; preds = %bb16 %i37 = fpext <2 x half> %arg4 to <2 x float> %i39 = fpext <2 x half> %i27 to <2 x float> %i40 = fpext <2 x half> %i30 to <2 x float> %i41 = fpext <2 x half> %i33 to <2 x float> %i42 = fpext <2 x half> %i36 to <2 x float> - %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8 %i44 = fadd contract <2 x float> %i37, %i43 %i45 = fadd contract <2 x float> %i43, zeroinitializer - %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32 %i47 = fadd contract <2 x float> %i39, %i46 %i48 = fadd contract <2 x float> %i40, %i43 %i49 = fadd contract <2 x float> %i41, zeroinitializer %i50 = fadd contract <2 x float> %i42, zeroinitializer - fence syncscope("workgroup") acquire - br i1 %i11, label %bb58, label %bb51 - -bb51: ; preds = %bb16 %i52 = fadd contract <2 x float> %i18, %i44 %i53 = fadd contract <2 x float> %i19, %i45 %i54 = fadd contract <2 x float> %i20, %i47 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -508,11 +508,11 @@ bb: %tmp = icmp slt i32 %arg2, 9 %tmp6 = icmp eq i32 %arg1, 0 - %tmp7 = icmp sgt i32 %arg4, 0 %tmp8 = icmp sgt i32 %arg4, 5 br i1 %tmp8, label %bb9, label %bb13 bb9: ; preds = %bb + %tmp7 = icmp sgt i32 %arg4, 0 %tmp10 = and i1 %tmp7, %tmp %tmp11 = icmp slt i32 %arg3, %arg4 %tmp12 = or i1 %tmp11, %tmp7 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -8,19 +8,19 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999 -; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3 +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT: if: -; OPT-NEXT: [[TMP0:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to i8 addrspace(1)* -; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28 -; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)* -; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[TMP1]], i32 2) +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 7 +; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[IN_GEP]], i32 2) ; OPT-NEXT: br label [[ENDIF]] ; OPT: endif: ; OPT-NEXT: [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ] +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999 ; OPT-NEXT: store i32 [[X]], i32 addrspace(1)* [[OUT_GEP]], align 4 +; OPT-NEXT: br label [[DONE:%.*]] +; OPT: done: ; OPT-NEXT: ret void ; ; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32: @@ -43,18 +43,18 @@ ; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252 ; GCN-NEXT: s_endpgm entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999 - %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp = icmp eq i32 %tid, 0 br i1 %cmp, label %endif, label %if if: + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7 %val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %in.gep, i32 2) br label %endif endif: %x = phi i32 [ %val, %if ], [ 0, %entry ] + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999 store i32 %x, i32 addrspace(1)* %out.gep br label %done diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -7,20 +7,20 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in) { ; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999 -; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]] +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT: if: -; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)* -; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28 -; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)* -; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00) +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, float addrspace(1)* [[IN:%.*]], i32 7 +; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[IN_GEP]], float 2.000000e+00) ; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4 ; OPT-NEXT: br label [[ENDIF]] ; OPT: endif: ; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999 ; OPT-NEXT: store float [[X]], float addrspace(1)* [[OUT_GEP]], align 4 +; OPT-NEXT: br label [[DONE:%.*]] +; OPT: done: ; OPT-NEXT: ret void ; ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: @@ -45,19 +45,19 @@ ; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300 ; GCN-NEXT: s_endpgm entry: - %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999 - %in.gep = getelementptr float, float addrspace(1)* %in, i32 7 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp = icmp eq i32 %tid, 0 br i1 %cmp, label %endif, label %if if: + %in.gep = getelementptr float, float addrspace(1)* %in, i32 7 %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0) %val = load volatile float, float addrspace(1)* undef br label %endif endif: %x = phi float [ %val, %if ], [ 0.0, %entry ] + %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999 store float %x, float addrspace(1)* %out.gep br label %done diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -11,13 +11,13 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(i32 addrspace(1)* nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { entry: - %rem.lhs.trunc = trunc i32 %s to i16 - %rem4 = urem i16 %rem.lhs.trunc, 12 - %rem.zext = zext i16 %rem4 to i32 %cmp = icmp eq i32 %s, 3 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry + %rem.lhs.trunc = trunc i32 %s to i16 + %rem4 = urem i16 %rem.lhs.trunc, 12 + %rem.zext = zext i16 %rem4 to i32 %idxprom = zext i32 %s to i64 %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %Ad.coerce, i64 %idxprom %div = lshr i32 %rem.zext, 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -254,12 +254,6 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; return to shader part epilog main_body: - %c.bc = bitcast i32 %c to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 - %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 - %data.sample = extractelement <4 x float> %dtex, i32 0 - %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -271,6 +265,12 @@ br label %END ELSE: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %data.sample = extractelement <4 x float> %dtex, i32 0 + call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) br label %END diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -10,36 +10,37 @@ ; OPT-NEXT: main_body: ; OPT-NEXT: br label [[LOOP_OUTER:%.*]] ; OPT: LOOP.outer: -; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP9:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ] +; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ] ; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ] ; OPT-NEXT: br label [[LOOP:%.*]] ; OPT: LOOP: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP7:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ] +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ] ; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ] -; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP47:%.*]], [[FLOW]] ] -; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1 +; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ] ; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]] ; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]]) ; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0 ; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1 ; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]] ; OPT: Flow: -; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ] -; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] +; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ] +; OPT-NEXT: [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]]) -; OPT-NEXT: [[TMP7]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN]]) -; OPT-NEXT: [[TMP8:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]]) -; OPT-NEXT: [[TMP9]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN2]]) -; OPT-NEXT: br i1 [[TMP8]], label [[FLOW1]], label [[LOOP]] +; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) +; OPT-NEXT: [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]]) +; OPT-NEXT: br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]] ; OPT: Flow1: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) -; OPT-NEXT: [[TMP10:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP9]]) -; OPT-NEXT: br i1 [[TMP10]], label [[IF:%.*]], label [[LOOP_OUTER]] +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) +; OPT-NEXT: [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]]) +; OPT-NEXT: br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]] ; OPT: IF: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP9]]) +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]]) ; OPT-NEXT: ret void ; OPT: ENDIF: +; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1 ; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]] ; OPT-NEXT: [[TMP51_INV]] = xor i1 [[TMP51]], true ; OPT-NEXT: br label [[FLOW]] @@ -98,7 +99,6 @@ LOOP: ; preds = %ENDIF, %LOOP.outer %tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ] - %tmp47 = add i32 %tmp45, 1 %tmp48 = icmp slt i32 %tmp45, %ub br i1 %tmp48, label %ENDIF, label %IF @@ -106,6 +106,7 @@ ret void ENDIF: ; preds = %LOOP + %tmp47 = add i32 %tmp45, 1 %tmp51 = icmp eq i32 %tmp47, %cont br i1 %tmp51, label %LOOP, label %LOOP.outer } diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -190,16 +190,16 @@ ; GCN-NEXT: s_endpgm ; IR-LABEL: @nested_loop_conditions( ; IR-NEXT: bb: +; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef +; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9 +; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]] +; IR: bb14.lr.ph: ; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4 ; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64 ; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]] ; IR-NEXT: [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16 ; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 ; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0 -; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef -; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9 -; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]] -; IR: bb14.lr.ph: ; IR-NEXT: br label [[BB14:%.*]] ; IR: Flow3: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]]) @@ -277,17 +277,17 @@ ; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef ; IR-NEXT: ret void bb: + %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef + %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 + br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13 + +bb14.lr.ph: ; preds = %bb %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = zext i32 %my.tmp to i64 %my.tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %my.tmp1 %my.tmp3 = load i64, i64 addrspace(1)* %my.tmp2, align 16 %my.tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 %my.tmp1033 = extractelement <4 x i32> %my.tmp932, i64 0 - %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef - %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 - br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13 - -bb14.lr.ph: ; preds = %bb br label %bb14 bb4.bb13_crit_edge: ; preds = %bb21 diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -21,7 +21,8 @@ ; GFX10-NEXT: s_or_b32 s1, s0, s1 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_cbranch_execz .LBB0_4 -; GFX10-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB0_2: ; %bb +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_or_b32 s2, s2, exec_lo ; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB0_1 @@ -50,20 +51,20 @@ ; GFX10-NEXT: s_inst_prefetch 0x2 ; GFX10-NEXT: s_endpgm branch1_true: - br label %2 + br label %bb -2: ; preds = %branch2_merge, %branch1_true +bb: ; preds = %branch2_merge, %branch1_true %r1.8.vec.insert14.i1 = phi float [ 0.000000e+00, %branch1_true ], [ %0, %branch2_merge ] - %3 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0) - %4 = icmp eq i32 %1, 0 - br i1 %4, label %loop0_merge, label %branch2_merge + %i = icmp eq i32 %1, 0 + br i1 %i, label %loop0_merge, label %branch2_merge -branch2_merge: ; preds = %2 - %5 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %3, float %0, float 0.000000e+00) - %6 = fcmp ult float %5, 0.000000e+00 - br i1 %6, label %2, label %loop0_merge +branch2_merge: ; preds = %bb + %i2 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0) + %i3 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %i2, float %0, float 0.000000e+00) + %i4 = fcmp ult float %i3, 0.000000e+00 + br i1 %i4, label %bb, label %loop0_merge -loop0_merge: ; preds = %branch2_merge, %2 +loop0_merge: ; preds = %branch2_merge, %bb ret void } diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -36,7 +36,6 @@ .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 %.not10002 = icmp eq i32 %LocalInvocationId.i0, 0 - %i530 = icmp ult i32 %LocalInvocationId.i0, 4 br i1 %.not10002, label %.merge, label %.bb0 .bb0: @@ -44,6 +43,7 @@ .merge: %src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ] + %i530 = icmp ult i32 %LocalInvocationId.i0, 4 br i1 %i530, label %.end, label %.then .then: @@ -103,7 +103,6 @@ .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 %.not10002 = icmp eq i32 %LocalInvocationId.i0, 0 - %i530 = icmp ult i32 %LocalInvocationId.i0, 4 br i1 %.not10002, label %.merge, label %.bb0 .bb0: @@ -111,6 +110,7 @@ .merge: %src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ] + %i530 = icmp ult i32 %LocalInvocationId.i0, 4 br i1 %i530, label %.then, label %.else .then: diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1374,10 +1374,10 @@ ; GFX11-NEXT: s_endpgm bb: %tmp = fcmp ult float %arg1, 0.000000e+00 - %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000 br i1 %tmp, label %bb6, label %bb3 bb3: ; preds = %bb + %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000 br i1 %tmp2, label %bb5, label %bb4 bb4: ; preds = %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -260,10 +260,10 @@ bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp sgt i32 %cond0, 0 - %cmp1 = icmp sgt i32 %cond1, 0 br i1 %cmp0, label %bb2, label %bb9 bb2: ; preds = %bb + %cmp1 = icmp sgt i32 %cond1, 0 %tmp2 = sext i1 %cmp1 to i32 %tmp3 = add i32 %tmp2, %tmp br i1 %cmp1, label %bb9, label %bb7 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2372,14 +2372,14 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %c.bc = bitcast i32 %c to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 - %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ENDIF IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %dataf = extractelement <4 x float> %dtex, i32 0 %data1 = fptosi float %dataf to i32 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) @@ -2909,14 +2909,14 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %c.bc = bitcast i32 %c to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 - %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ENDIF IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %dataf = extractelement <4 x float> %dtex, i32 0 %data1 = fptosi float %dataf to i32 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) @@ -2992,14 +2992,14 @@ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: - %c.bc = bitcast i32 %c to float - %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 - %tex0 = extractelement <4 x float> %tex, i32 0 - %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ENDIF IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %dataf = extractelement <4 x float> %dtex, i32 0 %data1 = fptosi float %dataf to i32 %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)