Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -0,0 +1,121 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
+
+; GCN-LABEL: {{^}}test_loop:
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+; GCN: s_endpgm
+define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  %cmp = icmp eq i32 %n, -1
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br label %for.body
+}
+
+; GCN-LABEL: @loop_const_true
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 true, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_false:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+; XXX - Should there be an S_ENDPGM?
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 false, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_undef:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+; XXX - Should there be an s_endpgm?
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 undef, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_arg_0:
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: v_cmp_eq_i32_e32 vcc, 1,
+
+; GCN: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, vcc
+; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccnz [[LOOPBB]]
+; GCN-NEXT: ; BB#2
+; GCN-NEXT: s_endpgm
+define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+entry:
+  br label %for.body
+
+for.exit:
+  ret void
+
+for.body:
+  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %tmp = add i32 %indvar, 32
+  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %add = fadd float %vecload, 1.0
+  store float %add, float addrspace(3)* %arrayidx, align 8
+  %inc = add i32 %indvar, 1
+  br i1 %cond, label %for.body, label %for.exit
+}
Index: test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -0,0 +1,126 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+                                                    <4 x i32> addrspace(1)* noalias %out1,
+                                                    i32 addrspace(1)* noalias %out2,
+                                                    i32 addrspace(1)* %in) {
+  %elt0 = load volatile i32, i32 addrspace(1)* %in
+  %elt1 = load volatile i32, i32 addrspace(1)* %in
+  %elt2 = load volatile i32, i32 addrspace(1)* %in
+  %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out1
+
+  %extract0 = extractelement <4 x i32> %vec3, i32 0
+  %extract1 = extractelement <4 x i32> %vec3, i32 1
+  %extract2 = extractelement <4 x i32> %vec3, i32 2
+  %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+  store volatile i32 %extract0, i32 addrspace(1)* %out2
+  store volatile i32 %extract1, i32 addrspace(1)* %out2
+  store volatile i32 %extract2, i32 addrspace(1)* %out2
+  store volatile i32 %extract3, i32 addrspace(1)* %out2
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_extract_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+                                                            <4 x i32> addrspace(1)* noalias %out1,
+                                                            i32 addrspace(1)* noalias %out2,
+                                                            i32 addrspace(1)* %in) {
+  %elt0 = load volatile i32, i32 addrspace(1)* %in
+  %elt1 = load volatile i32, i32 addrspace(1)* %in
+  %elt2 = load volatile i32, i32 addrspace(1)* %in
+  %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+  %extract0 = extractelement <4 x i32> %vec3, i32 0
+  %extract1 = extractelement <4 x i32> %vec3, i32 1
+  %extract2 = extractelement <4 x i32> %vec3, i32 2
+  %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+  %op0 = add i32 %extract0, 3
+  %op1 = sub i32 %extract1, 9
+  %op2 = xor i32 %extract2, 1231412
+  %op3 = and i32 %extract3, 258233412312
+
+  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+
+  store volatile i32 %op0, i32 addrspace(1)* %out2
+  store volatile i32 %op1, i32 addrspace(1)* %out2
+  store volatile i32 %op2, i32 addrspace(1)* %out2
+  store volatile i32 %op3, i32 addrspace(1)* %out2
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+                                                                     <4 x i32> addrspace(1)* noalias %out1,
+                                                                     i64 addrspace(1)* noalias %out2,
+                                                                     i32 addrspace(1)* %in) {
+  %elt0 = load volatile i32, i32 addrspace(1)* %in
+  %elt1 = load volatile i32, i32 addrspace(1)* %in
+  %elt2 = load volatile i32, i32 addrspace(1)* %in
+  %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+  %bc.vec3 = bitcast <4 x i32> %vec3 to <2 x i64>
+  store <2 x i64> %bc.vec3, <2 x i64> addrspace(1)* %out0
+
+  %extract0 = extractelement <2 x i64> %bc.vec3, i32 0
+  %extract1 = extractelement <2 x i64> %bc.vec3, i32 1
+
+  store volatile i64 %extract0, i64 addrspace(1)* %out2
+  store volatile i64 %extract1, i64 addrspace(1)* %out2
+
+  ret void
+}
Index: test/CodeGen/AMDGPU/mad24-get-global-id.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; If the workgroup id range is restricted, we should be able to use
+; mad24 for the usual indexing pattern.
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+; GCN-LABEL: {{^}}get_global_id_0:
+; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
+; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
+; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
+define void @get_global_id_0(i32 addrspace(1)* %out) #1 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
+  %workgroup.size.xy = load i32, i32 addrspace(2)* %gep, align 4, !invariant.load !0
+  %workgroup.size.x = and i32 %workgroup.size.xy, 65535
+
+  %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+  %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x(), !range !2
+
+  %mul = mul i32 %workgroup.id.x, %workgroup.size.x
+  %add = add i32 %mul, %workitem.id.x
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!0 = !{}
+!1 = !{i32 0, i32 1024}
+!2 = !{i32 0, i32 16777216}
Index: test/CodeGen/AMDGPU/no-shrink-extloads.ll
===================================================================
--- test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -201,3 +201,15 @@
   store i32 %mask, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %bc = bitcast <2 x i32> %ld to i64
+  %hi = lshr i64 %bc, 32
+  %trunc = trunc i64 %hi to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-
-
-define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
-; CHECK-LABEL: {{^}}test:
-
-entry:
-  switch i32 %x, label %sw.default [
-    i32 0, label %sw.bb
-    i32 60, label %sw.bb
-  ]
-
-sw.bb:
-  unreachable
-
-sw.default:
-  unreachable
-
-sw.epilog:
-  ret void
-}
-
Index: test/CodeGen/AMDGPU/si-annotate-cf.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -64,6 +64,87 @@
   ret void
 }
 
+; FIXME: should emit s_endpgm
+; CHECK-LABEL: {{^}}switch_unreachable:
+; CHECK-NOT: s_endpgm
+; CHECK: .Lfunc_end2
+define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+centry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 60, label %sw.bb
+  ]
+
+sw.bb:
+  unreachable
+
+sw.default:
+  unreachable
+
+sw.epilog:
+  ret void
+}
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; This broke the old AMDIL cfg structurizer
+; FUNC-LABEL: {{^}}loop_land_info_assert:
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc0 [[ENDPGM:BB[0-9]+_[0-9]+]]
+
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc1 [[ENDPGM]]
+
+; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
+; SI: s_branch [[INFLOOP]]
+
+; SI: [[ENDPGM]]:
+; SI: s_endpgm
+define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
+entry:
+  %cmp = icmp sgt i32 %c0, 0
+  br label %while.cond.outer
+
+while.cond.outer:
+  %tmp = load float, float addrspace(1)* undef
+  br label %while.cond
+
+while.cond:
+  %cmp1 = icmp slt i32 %c1, 4
+  br i1 %cmp1, label %convex.exit, label %for.cond
+
+convex.exit:
+  %or = or i1 %cmp, %cmp1
+  br i1 %or, label %return, label %if.end
+
+if.end:
+  %tmp3 = call float @llvm.fabs.f32(float %tmp) nounwind readnone
+  %cmp2 = fcmp olt float %tmp3, 0x3E80000000000000
+  br i1 %cmp2, label %if.else, label %while.cond.outer
+
+if.else:
+  store volatile i32 3, i32 addrspace(1)* undef, align 4
+  br label %while.cond
+
+for.cond:
+  %cmp3 = icmp slt i32 %c3, 1000
+  br i1 %cmp3, label %for.body, label %return
+
+for.body:
+  br i1 %cmp3, label %self.loop, label %if.end.2
+
+if.end.2:
+  %or.cond2 = or i1 %cmp3, %arg
+  br i1 %or.cond2, label %return, label %for.cond
+
+self.loop:
+  br label %self.loop
+
+return:
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
 
 attributes #0 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test:
 ; CHECK s_and_saveexec_b64
@@ -6,7 +6,7 @@
 ; CHECK s_or_b64 exec, exec
 ; CHECK s_andn2_b64 exec, exec
 ; CHECK s_cbranch_execnz
-define spir_kernel void @test(i32 %arg, i32 %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6) {
+define void @test(i32 %arg, i32 %arg1) {
 bb:
   %tmp = icmp ne i32 %arg, 0
   %tmp7 = icmp ne i32 %arg1, 0
Index: test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
===================================================================
--- test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -1,8 +1,6 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
 ; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+; XUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
 
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
 
@@ -41,8 +39,9 @@
 }
 
 ; COMMON-LABEL: {{^}}branch_false:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
 define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
@@ -77,8 +76,9 @@
 }
 
 ; COMMON-LABEL: {{^}}branch_undef:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
 define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph