Index: test/CodeGen/AMDGPU/cayman-loop-bug.ll
===================================================================
--- test/CodeGen/AMDGPU/cayman-loop-bug.ll
+++ test/CodeGen/AMDGPU/cayman-loop-bug.ll
@@ -11,20 +11,26 @@
 define amdgpu_ps void @main (<4 x float> inreg %reg0) {
 entry:
   br label %outer_loop
+
 outer_loop:
   %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop]
   %cond = icmp eq i32 %cnt, 16
   br i1 %cond, label %outer_loop_body, label %exit
+
 outer_loop_body:
   %cnt_incr = add i32 %cnt, 1
   br label %inner_loop
+
 inner_loop:
   %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body]
-  %cond2 = icmp eq i32 %cnt2, 16
-  br i1 %cond, label %inner_loop_body, label %outer_loop
+  %n = load volatile i32, i32 addrspace(1)* undef
+  %cond2 = icmp slt i32 %cnt2, %n
+  br i1 %cond2, label %inner_loop_body, label %outer_loop
+
 inner_loop_body:
   %cnt2_incr = add i32 %cnt2, 1
   br label %inner_loop
+
 exit:
   ret void
 }
Index: test/CodeGen/AMDGPU/commute-shifts.ll
===================================================================
--- test/CodeGen/AMDGPU/commute-shifts.ll
+++ test/CodeGen/AMDGPU/commute-shifts.ll
@@ -4,9 +4,9 @@
 ; GCN-LABEL: {{^}}main:
 ; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1
-define amdgpu_ps void @main() #0 {
+define amdgpu_ps void @main(float %arg0, float %arg1) #0 {
 bb:
-  %tmp = fptosi float undef to i32
+  %tmp = fptosi float %arg0 to i32
   %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %tmp2.f = extractelement <4 x float> %tmp1, i32 0
   %tmp2 = bitcast float %tmp2.f to i32
@@ -14,7 +14,7 @@
   %tmp4 = shl i32 1, %tmp3
   %tmp5 = and i32 %tmp2, %tmp4
   %tmp6 = icmp eq i32 %tmp5, 0
-  %tmp7 = select i1 %tmp6, float 0.000000e+00, float undef
+  %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
   %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
   %tmp9 = bitcast i32 %tmp8 to float
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9)
Index: test/CodeGen/AMDGPU/i1-copy-phi.ll
===================================================================
--- test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -24,7 +24,8 @@
   br i1 %tmp, label %bb4, label %bb6
 
 bb4:                                              ; preds = %bb3
-  %tmp5 = mul i32 undef, %arg
+  %val = load volatile i32, i32 addrspace(1)* undef
+  %tmp5 = mul i32 %val, %arg
   br label %bb6
 
 bb6:                                              ; preds = %bb4, %bb3
Index: test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf.ll
+++ test/CodeGen/AMDGPU/mubuf.ll
@@ -62,8 +62,7 @@
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  %tmp5 = bitcast float 0.0 to i32
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
   ret void
 }
 
@@ -81,8 +80,7 @@
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  %tmp5 = bitcast float 0.0 to i32
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
   ret void
 }
 
Index: test/CodeGen/AMDGPU/or.ll
===================================================================
--- test/CodeGen/AMDGPU/or.ll
+++ test/CodeGen/AMDGPU/or.ll
@@ -96,7 +96,7 @@
 ; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
-  %loadb = load i64, i64 addrspace(1)* %a, align 8
+  %loadb = load i64, i64 addrspace(1)* %b, align 8
   %or = or i64 %loada, %loadb
   store i64 %or, i64 addrspace(1)* %out
   ret void
Index: test/CodeGen/AMDGPU/udivrem.ll
===================================================================
--- test/CodeGen/AMDGPU/udivrem.ll
+++ test/CodeGen/AMDGPU/udivrem.ll
@@ -51,11 +51,11 @@
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
   %result0 = udiv i32 %x, %y
-  store i32 %result0, i32 addrspace(1)* %out
+  store i32 %result0, i32 addrspace(1)* %out0
   %result1 = urem i32 %x, %y
-  store i32 %result1, i32 addrspace(1)* %out
+  store i32 %result1, i32 addrspace(1)* %out1
   ret void
 }
 
Index: test/CodeGen/AMDGPU/uniform-crash.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-crash.ll
+++ test/CodeGen/AMDGPU/uniform-crash.ll
@@ -35,11 +35,12 @@
   br label %bb3
 
 bb3:                                              ; preds = %bb3, %bb2
-  %tmp4 = icmp eq i32 undef, %arg1
+  %val = load volatile i32, i32 addrspace(2)* undef
+  %tmp4 = icmp eq i32 %val, %arg1
   br i1 %tmp4, label %bb5, label %bb3
 
 bb5:                                              ; preds = %bb3, %bb
-  %tmp6 = tail call i32 @llvm.r600.read.tidig.y() #1
+  %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #1
   %tmp10 = icmp ult i32 %tmp6, %arg
   br i1 %tmp10, label %bb11, label %bb12
 
@@ -51,6 +52,6 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
 
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/vselect.ll
===================================================================
--- test/CodeGen/AMDGPU/vselect.ll
+++ test/CodeGen/AMDGPU/vselect.ll
@@ -2,28 +2,28 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
-;FUNC-LABEL: {{^}}test_select_v2i32:
+; FUNC-LABEL: {{^}}test_select_v2i32:
 
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
 
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
 
-define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
 entry:
-  %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
-  %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
-  %cmp = icmp ne <2 x i32> %0, %1
-  %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1
+  %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
+  %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
+  %cmp = icmp sgt <2 x i32> %load0, %load1
+  %result = select <2 x i1> %cmp, <2 x i32> %val, <2 x i32> %load0
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-;FUNC-LABEL: {{^}}test_select_v2f32:
+; FUNC-LABEL: {{^}}test_select_v2f32:
 
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI: v_cndmask_b32_e64
 ;SI: v_cndmask_b32_e32
@@ -40,24 +40,24 @@
 
 ;FUNC-LABEL: {{^}}test_select_v4i32:
 
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[4].X
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].W
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
 
 ; FIXME: The shrinking does not happen on tonga
 
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
 
-define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
 entry:
-  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
-  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
-  %cmp = icmp ne <4 x i32> %0, %1
-  %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1
+  %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
+  %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
+  %cmp = icmp sgt <4 x i32> %load0, %load1
+  %result = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %load0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }