Index: test/Analysis/CostModel/AMDGPU/add-sub.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -3,7 +3,7 @@
 ; CHECK: 'add_i32'
 ; CHECK: estimated cost of 1 for {{.*}} add i32
-define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %add = add i32 %vec, %b
 store i32 %add, i32 addrspace(1)* %out
@@ -12,7 +12,7 @@
 ; CHECK: 'add_v2i32'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
-define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %add = add <2 x i32> %vec, %b
 store <2 x i32> %add, <2 x i32> addrspace(1)* %out
@@ -21,7 +21,7 @@
 ; CHECK: 'add_v3i32'
 ; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
-define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
 %add = add <3 x i32> %vec, %b
 store <3 x i32> %add, <3 x i32> addrspace(1)* %out
@@ -30,7 +30,7 @@
 ; CHECK: 'add_v4i32'
 ; CHECK: estimated cost of 4 for {{.*}} add <4 x i32>
-define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
 %add = add <4 x i32> %vec, %b
 store <4 x i32> %add, <4 x i32> addrspace(1)* %out
@@ -39,7 +39,7 @@
 ; CHECK: 'add_i64'
 ; CHECK: estimated cost of 2 for {{.*}} add i64
-define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %add = add i64 %vec, %b
 store i64 %add, i64 addrspace(1)* %out
@@ -48,7 +48,7 @@
 ; CHECK: 'add_v2i64'
 ; CHECK: estimated cost of 4 for {{.*}} add <2 x i64>
-define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
 %add = add <2 x i64> %vec, %b
 store <2 x i64> %add, <2 x i64> addrspace(1)* %out
@@ -57,7 +57,7 @@
 ; CHECK: 'add_v3i64'
 ; CHECK: estimated cost of 6 for {{.*}} add <3 x i64>
-define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
 %add = add <3 x i64> %vec, %b
 store <3 x i64> %add, <3 x i64> addrspace(1)* %out
@@ -66,7 +66,7 @@
 ; CHECK: 'add_v4i64'
 ; CHECK: estimated cost of 8 for {{.*}} add <4 x i64>
-define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
 %add = add <4 x i64> %vec, %b
 store <4 x i64> %add, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@
 ; CHECK: 'add_v16i64'
 ; CHECK: estimated cost of 32 for {{.*}} add <16 x i64>
-define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
+define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
 %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
 %add = add <16 x i64> %vec, %b
 store <16 x i64> %add, <16 x i64> addrspace(1)* %out
@@ -84,7 +84,7 @@
 ; CHECK: 'add_i16'
 ; CHECK: estimated cost of 1 for {{.*}} add i16
-define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
 %vec = load i16, i16 addrspace(1)* %vaddr
 %add = add i16 %vec, %b
 store i16 %add, i16 addrspace(1)* %out
@@ -93,7 +93,7 @@
 ; CHECK: 'add_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i16>
-define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
 %add = add <2 x i16> %vec, %b
 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -102,7 +102,7 @@
 ; CHECK: 'sub_i32'
 ; CHECK: estimated cost of 1 for {{.*}} sub i32
-define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %sub = sub i32 %vec, %b
 store i32 %sub, i32 addrspace(1)* %out
@@ -111,7 +111,7 @@
 ; CHECK: 'sub_i64'
 ; CHECK: estimated cost of 2 for {{.*}} sub i64
-define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %sub = sub i64 %vec, %b
 store i64 %sub, i64 addrspace(1)* %out
@@ -119,7 +119,7 @@
 }
 ; CHECK: 'sub_i16'
 ; CHECK: estimated cost of 1 for {{.*}} sub i16
-define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
 %vec = load i16, i16 addrspace(1)* %vaddr
 %sub = sub i16 %vec, %b
 store i16 %sub, i16 addrspace(1)* %out
@@ -128,7 +128,7 @@
 ; CHECK: 'sub_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} sub <2 x i16>
-define void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
 %sub = sub <2 x i16> %vec, %b
 store <2 x i16> %sub, <2 x i16> addrspace(1)* %out
Index: test/Analysis/CostModel/AMDGPU/bit-ops.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/bit-ops.ll
+++ test/Analysis/CostModel/AMDGPU/bit-ops.ll
@@ -2,7 +2,7 @@
 ; CHECK: 'or_i32'
 ; CHECK: estimated cost of 1 for {{.*}} or i32
-define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = or i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@
 ; CHECK: 'or_i64'
 ; CHECK: estimated cost of 2 for {{.*}} or i64
-define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = or i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@
 ; CHECK: 'xor_i32'
 ; CHECK: estimated cost of 1 for {{.*}} xor i32
-define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = xor i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; CHECK: 'xor_i64'
 ; CHECK: estimated cost of 2 for {{.*}} xor i64
-define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = xor i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
@@ -39,7 +39,7 @@
 ; CHECK: 'and_i32'
 ; CHECK: estimated cost of 1 for {{.*}} and i32
-define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = and i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@
 ; CHECK: 'and_i64'
 ; CHECK: estimated cost of 2 for {{.*}} and i64
-define void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = and i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
Index: test/Analysis/CostModel/AMDGPU/br.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/br.ll
+++ test/Analysis/CostModel/AMDGPU/br.ll
@@ -4,7 +4,7 @@
 ; CHECK: estimated cost of 10 for instruction: br i1
 ; CHECK: estimated cost of 10 for instruction: br label
 ; CHECK: estimated cost of 10 for instruction: ret void
-define void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 bb0:
 br i1 undef, label %bb1, label %bb2
@@ -21,7 +21,7 @@
 ; CHECK: 'test_switch_cost'
 ; CHECK: Unknown cost for instruction: switch
-define void @test_switch_cost(i32 %a) #0 {
+define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
 entry:
 switch i32 %a, label %default [
 i32 0, label %case0
Index: test/Analysis/CostModel/AMDGPU/extractelement.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -2,7 +2,7 @@
 ; CHECK: 'extractelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
-define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %elt = extractelement <2 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@
 ; CHECK: 'extractelement_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
-define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %elt = extractelement <2 x float> %vec, i32 1
 store float %elt, float addrspace(1)* %out
@@ -20,7 +20,7 @@
 ; CHECK: 'extractelement_v3i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
-define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
 %elt = extractelement <3 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; CHECK: 'extractelement_v4i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
-define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
 %elt = extractelement <4 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@
 ; CHECK: 'extractelement_v8i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
 %elt = extractelement <8 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@
 ; FIXME: Should be non-0
 ; CHECK: 'extractelement_v8i32_dynindex'
 ; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
+define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
 %elt = extractelement <8 x i32> %vec, i32 %idx
 store i32 %elt, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@
 ; CHECK: 'extractelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
-define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
 %elt = extractelement <2 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -66,7 +66,7 @@
 ; CHECK: 'extractelement_v3i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
-define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
 %elt = extractelement <3 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -75,7 +75,7 @@
 ; CHECK: 'extractelement_v4i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
-define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
 %elt = extractelement <4 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@
 ; CHECK: 'extractelement_v8i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
-define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
 %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
 %elt = extractelement <8 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -93,7 +93,7 @@
 ; CHECK: 'extractelement_v4i8'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
-define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
 %elt = extractelement <4 x i8> %vec, i8 1
 store i8 %elt, i8 addrspace(1)* %out
@@ -102,7 +102,7 @@
 ; CHECK: 'extractelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
-define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
 %elt = extractelement <2 x i16> %vec, i16 1
 store i16 %elt, i16 addrspace(1)* %out
Index: test/Analysis/CostModel/AMDGPU/fabs.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fabs.ll
+++ test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -2,7 +2,7 @@
 ; CHECK: 'fabs_f32'
 ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
-define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %fabs = call float @llvm.fabs.f32(float %vec) #1
 store float %fabs, float addrspace(1)* %out
@@ -11,7 +11,7 @@
 ; CHECK: 'fabs_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
 store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -20,7 +20,7 @@
 ; CHECK: 'fabs_v3f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
-define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
 store <3 x float> %fabs, <3 x float> addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; CHECK: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
-define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %fabs = call double @llvm.fabs.f64(double %vec) #1
 store double %fabs, double addrspace(1)* %out
@@ -38,7 +38,7 @@
 ; CHECK: 'fabs_v2f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
 store <2 x double> %fabs, <2 x double> addrspace(1)* %out
@@ -47,7 +47,7 @@
 ; CHECK: 'fabs_v3f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
-define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
 store <3 x double> %fabs, <3 x double> addrspace(1)* %out
@@ -56,7 +56,7 @@
 ; CHECK: 'fabs_f16'
 ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
-define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %fabs = call half @llvm.fabs.f16(half %vec) #1
 store half %fabs, half addrspace(1)* %out
@@ -65,7 +65,7 @@
 ; CHECK: 'fabs_v2f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
-define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
 store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -74,7 +74,7 @@
 ; CHECK: 'fabs_v3f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
-define void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
 %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
 %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
 store <3 x half> %fabs, <3 x half> addrspace(1)* %out
Index: test/Analysis/CostModel/AMDGPU/fadd.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fadd.ll
+++ test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -3,7 +3,7 @@
 ; ALL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
-define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %add = fadd float %vec, %b
 store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@
 ; ALL: 'fadd_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %add = fadd <2 x float> %vec, %b
 store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 ; ALL: 'fadd_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
-define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %add = fadd <3 x float> %vec, %b
 store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@
 ; ALL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
-define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %add = fadd double %vec, %b
 store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@
 ; ALL: 'fadd_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
-define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %add = fadd <2 x double> %vec, %b
 store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; ALL: 'fadd_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
-define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %add = fadd <3 x double> %vec, %b
 store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@
 ; ALL 'fadd_f16'
 ; ALL estimated cost of 1 for {{.*}} fadd half
-define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %add = fadd half %vec, %b
 store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@
 ; ALL 'fadd_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fadd <2 x half>
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %add = fadd <2 x half> %vec, %b
 store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@
 ; ALL 'fadd_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fadd <4 x half>
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
 %add = fadd <4 x half> %vec, %b
 store <4 x half> %add, <4 x half> addrspace(1)* %out
Index: test/Analysis/CostModel/AMDGPU/fdiv.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -5,7 +5,7 @@
 ; CHECK: 'fdiv_f32'
 ; ALL: estimated cost of 10 for {{.*}} fdiv float
-define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %add = fdiv float %vec, %b
 store float %add, float addrspace(1)* %out
@@ -14,7 +14,7 @@
 ; ALL: 'fdiv_v2f32'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %add = fdiv <2 x float> %vec, %b
 store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -23,7 +23,7 @@
 ; ALL: 'fdiv_v3f32'
 ; ALL: estimated cost of 30 for {{.*}} fdiv <3 x float>
-define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %add = fdiv <3 x float> %vec, %b
 store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %add = fdiv double %vec, %b
 store double %add, double addrspace(1)* %out
@@ -47,7 +47,7 @@
 ; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
 ; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
 ; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
-define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %add = fdiv <2 x double> %vec, %b
 store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@
 ; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
 ; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
 ; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
-define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %add = fdiv <3 x double> %vec, %b
 store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -68,7 +68,7 @@
 ; ALL: 'fdiv_f16'
 ; ALL: estimated cost of 10 for {{.*}} fdiv half
-define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %add = fdiv half %vec, %b
 store half %add, half addrspace(1)* %out
@@ -77,7 +77,7 @@
 ; ALL: 'fdiv_v2f16'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x half>
-define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %add = fdiv <2 x half> %vec, %b
 store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -86,7 +86,7 @@
 ; ALL: 'fdiv_v4f16'
 ; ALL: estimated cost of 40 for {{.*}} fdiv <4 x half>
-define void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
 %add = fdiv <4 x half> %vec, %b
 store <4 x half> %add, <4 x half> addrspace(1)* %out
Index: test/Analysis/CostModel/AMDGPU/fmul.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fmul.ll
+++ test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -3,7 +3,7 @@
 ; ALL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
-define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %add = fmul float %vec, %b
 store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@
 ; ALL: 'fmul_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fmul <2 x float>
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %add = fmul <2 x float> %vec, %b
 store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 ; ALL: 'fmul_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
-define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %add = fmul <3 x float> %vec, %b
 store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@
 ; ALL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %add = fmul double %vec, %b
 store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@
 ; ALL: 'fmul_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %add = fmul <2 x double> %vec, %b
 store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; ALL: 'fmul_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
-define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = ashr i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; ALL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
 ; SLOW64: estimated cost of 3 for {{.*}} ashr i64
-define void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = ashr i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
 %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
 store i32 %swizzle, i32 addrspace(1)* %out, align 4
 ret void
Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
@@ -5,7 +5,7 @@
 ; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
 ; The post dominator tree does not have a root node in this case
-define void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
+define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
 bb0:
 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %tmp2 = sext i32 %tmp to i64
Index: test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
@@ -1,7 +1,7 @@
 ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
 ; CHECK: DIVERGENT: %tmp = cmpxchg volatile
-define void @unreachable_loop(i32 %tidx) #0 {
+define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {
 entry:
 unreachable
Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -7,35 +7,35 @@
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 ; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define void @workitem_id_x() #1 {
+define amdgpu_kernel void @workitem_id_x() #1 {
 %id.x = call i32 @llvm.amdgcn.workitem.id.x()
 store volatile i32 %id.x, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y()
-define void @workitem_id_y() #1 {
+define amdgpu_kernel void @workitem_id_y() #1 {
 %id.y = call i32 @llvm.amdgcn.workitem.id.y()
 store volatile i32 %id.y, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z()
-define void @workitem_id_z() #1 {
+define amdgpu_kernel void @workitem_id_z() #1 {
 %id.z = call i32 @llvm.amdgcn.workitem.id.z()
 store volatile i32 %id.z, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
-define void @mbcnt_lo() #1 {
+define amdgpu_kernel void @mbcnt_lo() #1 {
 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
 store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
-define void @mbcnt_hi() #1 {
+define amdgpu_kernel void @mbcnt_hi() #1 {
 %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
 store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
 ret void
Index: test/CodeGen/AMDGPU/32-bit-local-address-space.ll
===================================================================
--- test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}local_address_load:
 ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
 %0 = load i32, i32 addrspace(3)* %in
 store i32 %0, i32 addrspace(1)* %out
@@ -24,7 +24,7 @@
 ; SI: s_add_i32 [[SPTR:s[0-9]]]
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
 %1 = load i32, i32 addrspace(3)* %0
@@ -35,7 +35,7 @@
 ; FUNC-LABEL: {{^}}local_address_gep_const_offset:
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
-define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
 %1 = load i32, i32 addrspace(3)* %0
@@ -48,7 +48,7 @@
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
 %1 = load i32, i32 addrspace(3)* %0
@@ -60,7 +60,7 @@
 ; SI: v_cmp_ne_u32
 ; SI-NOT: v_cmp_ne_u32
 ; SI: v_cndmask_b32
-define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
 %cmp = icmp ne i32 addrspace(3)* %lds, null
 %x = select i1 %cmp, i32 123, i32 456
 store i32 %x, i32 addrspace(1)* %out
@@ -71,7 +71,7 @@
 ; SI: s_mul_i32
 ; SI-NEXT: s_add_i32
 ; SI: ds_read_b32
-define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
 %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
 %val = load float, float addrspace(3)* %ptr
 store float %val, float addrspace(1)* %out
@@ -83,7 +83,7 @@
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
-define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
 %val = load float, float addrspace(3)* @g_lds
 store float %val, float addrspace(1)* %out
 ret void
@@ -95,14 +95,14 @@
 ; FUNC-LABEL: {{^}}global_ptr:
 ; SI: ds_write_b32
-define void @global_ptr() nounwind {
+define amdgpu_kernel void @global_ptr() nounwind {
 store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
 ret void
 }
 ; FUNC-LABEL: {{^}}local_address_store:
 ; SI: ds_write_b32
-define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
 store i32 %val, i32 addrspace(3)* %out
 ret void
 }
@@ -111,7 +111,7 @@
 ; SI: s_add_i32 [[SADDR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
 store i32 %val, i32 addrspace(3)* %gep, align 4
 ret void
@@ -121,7 +121,7 @@
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
 store i32 %val, i32 addrspace(3)* %gep, align 4
 ret void
@@ -132,7 +132,7 @@
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
 store i32 %val, i32 addrspace(3)* %gep, align 4
 ret void
Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -5,7 +5,7 @@
 # REQUIRES: global-isel
 --- |
-  define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+  define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
 ...
 ---
Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
Index: test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
+++ test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@@ -3,12 +3,12 @@
 # REQUIRES: global-isel
 --- |
-  define void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
-  define void @load_global_uniform(i32 addrspace(1)* %ptr1) {
+  define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
+  define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
 %tmp0 = load i32, i32 addrspace(1)* %ptr1
 ret void
 }
-  define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
+  define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
 %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
 %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
 %tmp2 = load i32, i32 addrspace(1)* %tmp1
Index: test/CodeGen/AMDGPU/GlobalISel/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -9,7 +9,7 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
 %1 = load i32, i32 addrspace(2)* %0
@@ -21,7 +21,7 @@
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
 %1 = load i32, i32 addrspace(2)* %0
@@ -36,7 +36,7 @@
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
 %1 = load i32, i32 addrspace(2)* %0
@@ -51,7 +51,7 @@
 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; XGCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
 %1 = load i32, i32 addrspace(2)* %0
@@ -65,7 +65,7 @@
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
 %1 = load i32, i32 addrspace(2)* %0
@@ -79,7 +79,7 @@
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
 %1 = load i32, i32 addrspace(2)* %0
Index: test/CodeGen/AMDGPU/add-debug.ll
===================================================================
--- test/CodeGen/AMDGPU/add-debug.ll
+++ test/CodeGen/AMDGPU/add-debug.ll
@@ -3,7 +3,7 @@
 ; REQUIRES: asserts
 ; Check that SelectionDAGDumper does not crash on int_SI_if.
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
 %0 = icmp eq i64 %a, 0
 br i1 %0, label %if, label %else
Index: test/CodeGen/AMDGPU/add.i16.ll
===================================================================
--- test/CodeGen/AMDGPU/add.i16.ll
+++ test/CodeGen/AMDGPU/add.i16.ll
@@ -6,7 +6,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -23,7 +23,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -38,7 +38,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -53,7 +53,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -69,7 +69,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -89,7 +89,7 @@
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -109,7 +109,7 @@
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -130,7 +130,7 @@
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
Index: test/CodeGen/AMDGPU/add.ll
===================================================================
--- test/CodeGen/AMDGPU/add.ll
+++ test/CodeGen/AMDGPU/add.ll
@@ -8,7 +8,7 @@
 ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
 ;SI-NOT: [[REG]]
 ;SI: buffer_store_dword [[REG]],
-define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
 %a = load i32, i32 addrspace(1)* %in
 %b = load i32, i32 addrspace(1)* %b_ptr
@@ -24,7 +24,7 @@
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
 %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +44,7 @@
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+,
vcc, v[0-9]+, v[0-9]+}} -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -71,7 +71,7 @@ ; SI: s_add_i32 ; SI: s_add_i32 ; SI: s_add_i32 -define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { +define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { entry: %0 = add <8 x i32> %a, %b store <8 x i32> %0, <8 x i32> addrspace(1)* %out @@ -112,7 +112,7 @@ ; SI: s_add_i32 ; SI: s_add_i32 ; SI: s_add_i32 -define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { +define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { entry: %0 = add <16 x i32> %a, %b store <16 x i32> %0, <16 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: %0 = add i64 %a, %b store i64 %0, i64 addrspace(1)* %out @@ -150,7 +150,7 @@ ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { +define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { entry: %0 = load i64, i64 addrspace(1)* %in %1 = add i64 %a, %0 @@ -169,7 +169,7 @@ ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else Index: test/CodeGen/AMDGPU/add.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/add.v2i16.ll +++ test/CodeGen/AMDGPU/add.v2i16.ll @@ -7,7 +7,7 @@ ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -27,7 +27,7 @@ ; VI: s_add_i32 ; VI: s_add_i32 -define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { +define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 %add = add <2 x i16> %a, %b @@ -41,7 +41,7 @@ ; VI: s_add_i32 ; VI: s_add_i32 -define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { +define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* 
%in0 %add = add <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -54,7 +54,7 @@ ; VI: v_add_i32 ; VI: v_add_i32 -define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { +define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -66,7 +66,7 @@ ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}} -define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -83,7 +83,7 @@ ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}} -define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -102,7 +102,7 @@ ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]] ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -121,7 +121,7 @@ ; VI-NOT: v_add_u16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -141,7 +141,7 @@ ; VI-NOT: v_add_u16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -173,7 +173,7 @@ ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} -define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void 
@v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -208,7 +208,7 @@ ; VI: v_add_u16_e32 ; VI: buffer_store_dwordx4 -define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -236,7 +236,7 @@ ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: buffer_store_dwordx2 -define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -264,7 +264,7 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid Index: test/CodeGen/AMDGPU/add_i128.ll =================================================================== --- test/CodeGen/AMDGPU/add_i128.ll +++ test/CodeGen/AMDGPU/add_i128.ll @@ -6,7 +6,7 @@ ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]], -define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid @@ -23,7 +23,7 @@ ; GCN: v_addc_u32 ; GCN: v_addc_u32 ; GCN: v_addc_u32 -define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { +define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %foo, %a store i128 %result, i128 addrspace(1)* %out @@ -35,7 +35,7 @@ ; GCN: v_addc_u32 ; 
GCN: v_addc_u32 ; GCN: v_addc_u32 -define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { +define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %a, %foo store i128 %result, i128 addrspace(1)* %out @@ -47,7 +47,7 @@ ; GCN: s_addc_u32 ; GCN: s_addc_u32 ; GCN: s_addc_u32 -define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) { +define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) { %result = add i128 %a, %b store i128 %result, i128 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/add_i64.ll =================================================================== --- test/CodeGen/AMDGPU/add_i64.ll +++ test/CodeGen/AMDGPU/add_i64.ll @@ -6,7 +6,7 @@ ; SI-LABEL: {{^}}test_i64_vreg: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid @@ -21,7 +21,7 @@ ; SI-LABEL: {{^}}sgpr_operand: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { +define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %foo, %a store i64 %result, i64 addrspace(1)* %out @@ -34,7 +34,7 @@ ; SI-LABEL: {{^}}sgpr_operand_reversed: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { +define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %a, %foo store i64 %result, i64 addrspace(1)* %out @@ -47,7 +47,7 @@ ; SI: s_addc_u32 ; SI: s_add_u32 ; SI: s_addc_u32 -define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { +define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { %result = add <2 x i64> %a, %b store <2 x i64> %result, <2 x i64> addrspace(1)* %out ret void @@ -58,7 +58,7 @@ ; SI: v_addc_u32 ; SI: v_add_i32 ; SI: v_addc_u32 -define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid @@ -76,7 +76,7 @@ ; SI-NOT: addc ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, 
i64 %a, i64 %b) { %add = add i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/addrspacecast-captured.ll =================================================================== --- test/CodeGen/AMDGPU/addrspacecast-captured.ll +++ test/CodeGen/AMDGPU/addrspacecast-captured.ll @@ -9,7 +9,7 @@ ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out -define void @addrspacecast_captured(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 { entry: %data = alloca i32, align 4 %cast = addrspacecast i32* %data to i32 addrspace(4)* @@ -22,7 +22,7 @@ ; CHECK: %data = alloca i32, align 4 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* ; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out -define void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 { +define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 { entry: %data = alloca i32, align 4 %cast = addrspacecast i32* %data to i32 addrspace(4)* @@ -35,7 +35,7 @@ ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 ; CHECK: call void @consume_ptr2int(i32 %ptr2int) -define void @addrspacecast_captured_call() #0 { +define amdgpu_kernel void @addrspacecast_captured_call() #0 { entry: %data = alloca i32, align 4 %cast = addrspacecast i32* %data to i32 addrspace(4)* Index: test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll =================================================================== --- test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -9,57 +9,57 @@ @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 ; HSA: @store_cast_0_flat_to_group_addrspacecast() #1 -define void @store_cast_0_flat_to_group_addrspacecast() #1 { +define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) ret void } ; HSA: @store_cast_0_group_to_flat_addrspacecast() #2 -define void @store_cast_0_group_to_flat_addrspacecast() #1 { +define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*) ret void } -; HSA: define void @store_constant_cast_group_gv_to_flat() #2 -define void @store_constant_cast_group_gv_to_flat() #1 { +; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2 +define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*) ret void } ; HSA: @store_constant_cast_group_gv_gep_to_flat() #2 -define void @store_constant_cast_group_gv_gep_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 { store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) ret void } ; HSA: @store_constant_cast_global_gv_to_flat() #1 -define void @store_constant_cast_global_gv_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* 
@global.i32 to i32 addrspace(4)*) ret void } ; HSA: @store_constant_cast_global_gv_gep_to_flat() #1 -define void @store_constant_cast_global_gv_gep_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 { store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) ret void } ; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) store i32 %val, i32 addrspace(1)* %out ret void } ; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst store i32 %val, i32 addrspace(1)* %out ret void } ; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out @@ -67,28 +67,28 @@ } ; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false) ret void } ; Can't just search the pointer value ; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2 -define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 { +define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 { store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out ret void } ; Can't just search pointer types ; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2 -define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 { +define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 { store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* 
addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out ret void } ; Cast group to flat, do GEP, cast back to group ; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2 -define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { +define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) ret void } Index: test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- test/CodeGen/AMDGPU/addrspacecast.ll +++ test/CodeGen/AMDGPU/addrspacecast.ll @@ -28,7 +28,7 @@ ; CI: NumSgprs: {{[0-9][0-9]+}} ; GFX9: NumSgprs: {{[0-9]+}} -define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -58,7 +58,7 @@ ; CI: NumSgprs: {{[0-9][0-9]+}} ; GFX9: NumSgprs: {{[0-9]+}} -define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { %stof = addrspacecast i32* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -73,7 +73,7 @@ ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] -define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -85,7 +85,7 @@ ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} -define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %stof ret void @@ -102,7 +102,7 @@ ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: ds_write_b32 [[CASTPTR]], v[[K]] -define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void @@ -119,7 +119,7 @@ ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* store volatile i32 0, i32* %ftos ret void @@ -133,7 +133,7 @@ ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 
[[K:v[0-9]+]], 0 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] -define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void @@ -144,7 +144,7 @@ ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0 -define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* load volatile i32, i32 addrspace(2)* %ftos ret void @@ -158,7 +158,7 @@ ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_0_group_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %cast ret void @@ -168,7 +168,7 @@ ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] -define void @cast_0_flat_to_group_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)* store volatile i32 7, i32 addrspace(3)* %cast ret void @@ -179,7 +179,7 @@ ; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_neg1_group_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %cast ret void @@ -189,7 +189,7 @@ ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] -define void @cast_neg1_flat_to_group_addrspacecast() #0 { +define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)* store volatile i32 7, i32 addrspace(3)* %cast ret void @@ -204,7 +204,7 @@ ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_0_private_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { %cast = addrspacecast i32* null to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %cast ret void @@ -214,7 +214,7 @@ ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -define void @cast_0_flat_to_private_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)* store volatile i32 7, i32* %cast ret void @@ -226,7 +226,7 @@ ; HSA-LABEL: {{^}}branch_use_flat_i32: ; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} ; HSA: s_endpgm -define void 
@branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { entry: %cmp = icmp ne i32 %c, 0 br i1 %cmp, label %local, label %global @@ -259,7 +259,7 @@ ; HSA: flat_store_dword ; HSA: s_barrier ; HSA: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { +define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { %alloca = alloca i32, i32 9, align 4 %x = call i32 @llvm.amdgcn.workitem.id.x() #2 %pptr = getelementptr i32, i32* %alloca, i32 %x Index: test/CodeGen/AMDGPU/amdgcn.bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -16,7 +16,7 @@ ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: ; SI: s_endpgm -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 @@ -24,7 +24,7 @@ ret void } -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 %fadd32 = fadd float %load, 1.0 %bc = bitcast float %fadd32 to <2 x i16> @@ -33,7 +33,7 @@ ret void } -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %add.v2i16 = add <2 x i16> %load, %bc = bitcast <2 x i16> %add.v2i16 to float @@ -42,7 +42,7 @@ ret void } -define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 %fadd32 = fadd float %load, 1.0 %bc = bitcast float %fadd32 to <2 x half> @@ -51,7 +51,7 @@ ret void } -define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind { %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4 %add.v2f16 = fadd <2 x half> %load, %bc = bitcast <2 x half> %add.v2f16 to float @@ -60,14 +60,14 @@ ret void } -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %bc = bitcast <4 x i8> %load to i32 store i32 %bc, i32 addrspace(1)* %out, align 4 ret void } -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -76,7 +76,7 @@ ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: ; SI: s_endpgm -define void 
@bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %add = add <2 x i32> %val, %bc = bitcast <2 x i32> %add to double @@ -87,7 +87,7 @@ ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: ; SI: s_endpgm -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { %val = load double, double addrspace(1)* %in, align 8 %add = fadd double %val, 4.0 %bc = bitcast double %add to <2 x i32> @@ -96,7 +96,7 @@ } ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64: -define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { +define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -112,7 +112,7 @@ } ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64: -define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { +define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end Index: test/CodeGen/AMDGPU/amdgcn.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -15,7 +15,7 @@ ; GCN-NOT: v0 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]] -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: %0 = alloca [2 x i32] %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -4,7 +4,7 @@ ; NOOP-LABEL: @noop_fdiv_fpmath( ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0 -define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { +define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { %md.25ulp = fdiv float %a, %b, !fpmath !0 store volatile float %md.25ulp, float addrspace(1)* %out ret void @@ -18,7 +18,7 @@ ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { +define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -51,7 +51,7 @@ ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 -define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { %no.md = fdiv float 1.0, %x 
store volatile float %no.md, float addrspace(1)* %out @@ -89,7 +89,7 @@ ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1 -define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { +define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { %no.md = fdiv <2 x float> %a, %b store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out -define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> , %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -158,7 +158,7 @@ ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> , %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -186,7 +186,7 @@ ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 @@ -206,7 +206,7 @@ ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -80,7 +80,7 @@ ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -102,7 +102,7 @@ 
; OPT-LABEL: @high_alignment( ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}} -define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [8 x i32], align 16 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -127,7 +127,7 @@ ; OPT: alloca [5 x i32] ; SI-NOT: ds_write -define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -162,7 +162,7 @@ ; SI-NOT: v_movrel %struct.point = type { i32, i32 } -define void @multiple_structs(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: %a = alloca %struct.point %b = alloca %struct.point @@ -191,7 +191,7 @@ ; R600-NOT: MOVA_INT ; SI-NOT: v_movrel -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %prv_array_const = alloca [2 x i32] %prv_array = alloca [2 x i32] @@ -235,7 +235,7 @@ ; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]] ; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16 ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16 -define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i16] %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0 @@ -258,7 +258,7 @@ ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 -define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8] %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0 @@ -281,7 +281,7 @@ ; ; A total of 5 bytes should be allocated and used. 
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1 %1 = alloca [2 x i8], align 1 @@ -305,7 +305,7 @@ ret void } -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]] %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 @@ -319,7 +319,7 @@ ret void } -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -332,7 +332,7 @@ ret void } -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]] %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 @@ -347,7 +347,7 @@ %struct.pair32 = type { i32, i32 } -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]] %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 @@ -360,7 +360,7 @@ ret void } -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32] %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 @@ -373,7 +373,7 @@ ret void } -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -394,7 +394,7 @@ ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ; -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32] %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a store i32 5, i32* %tmp0 @@ -410,7 +410,7 @@ ; OPT-LABEL: @pointer_typed_alloca( ; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}} ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4 -define void @pointer_typed_alloca(i32 addrspace(1)* %A) { +define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 @@ -462,7 +462,7 @@ ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { %alloca = alloca [2 x <16 x i32>] %tmp0 = getelementptr [2 x <16 x 
i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a %tmp5 = load <16 x i32>, <16 x i32>* %tmp0 @@ -506,7 +506,7 @@ ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { %alloca = alloca [2 x <16 x float>] %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a %tmp5 = load <16 x float>, <16 x float>* %tmp0 @@ -522,7 +522,7 @@ ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { %alloca = alloca [16 x <2 x float>] %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a %tmp5 = load <2 x float>, <2 x float>* %tmp0 @@ -533,7 +533,7 @@ ; OPT-LABEL: @direct_alloca_read_0xi32( ; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)* ; OPT: load [0 x i32], [0 x i32] addrspace(3)* -define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [0 x i32] store [0 x i32] [], [0 x i32]* %tmp @@ -545,7 +545,7 @@ ; OPT-LABEL: @direct_alloca_read_1xi32( ; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)* ; OPT: load [1 x i32], [1 x i32] addrspace(3)* -define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [1 x i32] store [1 x i32] [i32 0], [1 x i32]* %tmp Index: test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll +++ test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll @@ -12,7 +12,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X -define void @ngroups_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -27,7 +27,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y -define void @ngroups_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -42,7 +42,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z -define void @ngroups_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -57,7 +57,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W -define void @global_size_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -72,7 +72,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? 
*}}[[VAL]], KC0[1].X -define void @global_size_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -87,7 +87,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y -define void @global_size_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -102,7 +102,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z -define void @local_size_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -117,7 +117,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W -define void @local_size_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -132,7 +132,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[2].X -define void @local_size_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -153,7 +153,7 @@ ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @tgid_x_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -165,7 +165,7 @@ ; GCN-NOHSA: buffer_store_dword [[VVAL]] ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 -define void @tgid_y_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -181,7 +181,7 @@ ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @tgid_z_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -194,7 +194,7 @@ ; FUNC-LABEL: {{^}}tidig_x_legacy: ; GCN-NOHSA: buffer_store_dword v0 -define void @tidig_x_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -208,7 +208,7 @@ ; FUNC-LABEL: {{^}}tidig_y_legacy: ; GCN-NOHSA: buffer_store_dword v1 -define void @tidig_y_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -221,7 +221,7 @@ ; FUNC-LABEL: {{^}}tidig_z_legacy: ; GCN-NOHSA: buffer_store_dword v2 -define void @tidig_z_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/and-gcn.ll =================================================================== --- 
test/CodeGen/AMDGPU/and-gcn.ll +++ test/CodeGen/AMDGPU/and-gcn.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}v_and_i64_br: ; SI: v_and_b32 ; SI: v_and_b32 -define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -11,7 +11,7 @@ ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -31,7 +31,7 @@ ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -42,7 +42,7 @@ ; FUNC-LABEL: {{^}}s_and_i32: ; SI: s_and_b32 -define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, %b store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -50,7 +50,7 @@ ; FUNC-LABEL: {{^}}s_and_constant_i32: ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 -define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -66,7 +66,7 @@ ; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] ; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] ; SI: buffer_store_dword [[VK]] -define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 ; Just to stop future replacement of copy to vgpr + store with VALU op. 
@@ -83,7 +83,7 @@ ; SI: s_add_i32 ; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] ; SI: buffer_store_dword [[VK]] -define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 %foo = add i32 %and, 1234567 %bar = add i32 %foo, %b @@ -93,7 +93,7 @@ ; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr: ; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -109,7 +109,7 @@ ; SI-DAG: s_load_dword [[SA:s[0-9]+]] ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] -define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -123,7 +123,7 @@ ; SI-DAG: s_load_dword [[SA:s[0-9]+]] ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] -define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) { +define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -135,7 +135,7 @@ ; FUNC-LABEL: {{^}}v_and_constant_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} -define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -144,7 +144,7 @@ ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} -define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, 64 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} -define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, -16 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -162,7 +162,7 @@ ; FUNC-LABEL: {{^}}s_and_i64 ; SI: s_and_b64 -define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %and = and i64 %a, %b store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -171,7 +171,7 @@ ; 
FIXME: Should use SGPRs ; FUNC-LABEL: {{^}}s_and_i1: ; SI: v_and_b32 -define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { +define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { %and = and i1 %a, %b store i1 %and, i1 addrspace(1)* %out ret void @@ -181,7 +181,7 @@ ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}} ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}} ; SI: buffer_store_dwordx2 -define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { %and = and i64 %a, 549756338176 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -191,7 +191,7 @@ ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}} ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}} ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}} -define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %and0 = and i64 %a, 549756338176 %and1 = and i64 %b, 549756338176 store volatile i64 %and0, i64 addrspace(1)* %out @@ -205,7 +205,7 @@ ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}} ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -223,7 +223,7 @@ ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { %shl.a = shl i64 %a, 1 %shl.b = shl i64 %b, 1 %and0 = and i64 %shl.a, 62 @@ -238,7 +238,7 @@ ; FUNC-LABEL: {{^}}v_and_i64: ; SI: v_and_b32 ; SI: v_and_b32 -define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %and = and i64 %a, %b @@ -250,7 +250,7 @@ ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}} ; SI: buffer_store_dwordx2 -define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1231231234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -268,7 +268,7 @@ ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]] ; SI: buffer_store_dwordx2 ; SI: buffer_store_dwordx2 -define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr %b = load volatile i64, i64 addrspace(1)* %aptr %and0 = and i64 %a, 1231231234567 @@ -288,7 +288,7 @@ ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]] -define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void 
@v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr %b = load volatile i64, i64 addrspace(1)* %aptr %and0 = and i64 %a, 63 @@ -304,7 +304,7 @@ ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]] ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -317,7 +317,7 @@ ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -331,7 +331,7 @@ ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} -define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, -8 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -344,7 +344,7 @@ ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64 ; SI-NOT: and ; SI: buffer_store_dword -define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -358,7 +358,7 @@ ; SI-NOT: and ; SI: s_add_u32 ; SI-NEXT: s_addc_u32 -define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { %shl = shl i64 %a, 1 %and = and i64 %shl, 64 %add = add i64 %and, %b @@ -372,7 +372,7 @@ ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 1 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -387,7 +387,7 @@ ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4607182418800017408 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -402,7 +402,7 @@ ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13830554455654793216 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -417,7 +417,7 @@ ; SI: s_and_b32 
{{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4602678819172646912 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -432,7 +432,7 @@ ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13826050856027422720 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -445,7 +445,7 @@ ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4611686018427387904 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -458,7 +458,7 @@ ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13835058055282163712 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -473,7 +473,7 @@ ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4616189618054758400 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -488,7 +488,7 @@ ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13839561654909534208 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -505,7 +505,7 @@ ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 1082130432 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -518,7 +518,7 @@ ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, -1065353216 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -531,7 +531,7 @@ ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 
%a) { +define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4647714815446351872 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -544,7 +544,7 @@ ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13871086852301127680 store i64 %and, i64 addrspace(1)* %out, align 8 ret void Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -11,22 +11,22 @@ declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -34,8 +34,8 @@ ret void } -; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -43,15 +43,15 @@ ret void } -; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -59,8 +59,8 @@ ret void } -; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { +; 
HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -68,8 +68,8 @@ ret void } -; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() %val2 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -79,29 +79,29 @@ ret void } -; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { -define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { -define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -109,8 +109,8 @@ ret void } -; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { -define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -118,8 +118,8 @@ ret void } -; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { -define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -129,8 +129,8 @@ ret void } -; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { -define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 
@llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -146,8 +146,8 @@ ret void } -; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { -define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { +define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %bc @@ -155,8 +155,8 @@ ret void } -; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { -define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { +define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %bc @@ -164,58 +164,58 @@ ret void } -; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { -define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { +define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 { -define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 { +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 { %stof = addrspacecast i32* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void } -; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* store volatile i32 0, i32* %ftos ret void } ; No-op addrspacecast should not use queue ptr -; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { -define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { -define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { +; HSA: define 
amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void } -; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* %ld = load volatile i32, i32 addrspace(2)* %ftos ret void Index: test/CodeGen/AMDGPU/annotate-kernel-features.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -12,22 +12,22 @@ declare i32 @llvm.r600.read.local.size.y() #0 declare i32 @llvm.r600.read.local.size.z() #0 -; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr %val1 = call i32 @llvm.r600.read.tgid.y() @@ -35,8 +35,8 @@ ret void } -; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -44,15 +44,15 @@ ret void } -; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tgid_x_z(i32 
addrspace(1)* %ptr) #3 { -define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -60,8 +60,8 @@ ret void } -; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.y() %val1 = call i32 @llvm.r600.read.tgid.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -69,8 +69,8 @@ ret void } -; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() %val2 = call i32 @llvm.r600.read.tgid.z() @@ -80,29 +80,29 @@ ret void } -; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { -define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { -define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tgid.x() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -110,8 +110,8 @@ ret void } -; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { -define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.y() %val1 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -119,8 +119,8 @@ ret void } -; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { -define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call 
i32 @llvm.r600.read.tidig.y() %val2 = call i32 @llvm.r600.read.tidig.z() @@ -130,8 +130,8 @@ ret void } -; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { -define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() %val2 = call i32 @llvm.r600.read.tidig.z() @@ -147,25 +147,25 @@ ret void } -; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.z() store i32 %val, i32 addrspace(1)* %ptr ret void Index: test/CodeGen/AMDGPU/anonymous-gv.ll =================================================================== --- test/CodeGen/AMDGPU/anonymous-gv.ll +++ test/CodeGen/AMDGPU/anonymous-gv.ll @@ -6,13 +6,13 @@ ; CHECK-LABEL: {{^}}test: ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1 ; CHECK: s_endpgm -define void @test() { +define amdgpu_kernel void @test() { store i32 1, i32 addrspace(1)* @0 ret void } ; CHECK-LABEL: {{^}}__unnamed_2: ; CHECK: s_endpgm -define void @1() { +define amdgpu_kernel void @1() { ret void } Index: test/CodeGen/AMDGPU/anyext.ll =================================================================== --- test/CodeGen/AMDGPU/anyext.ll +++ test/CodeGen/AMDGPU/anyext.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}anyext_i1_i32: ; GCN: v_cndmask_b32_e64 -define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { entry: %tmp = icmp eq i32 %cond, 0 %tmp1 = zext i1 %tmp to i8 @@ -22,7 +22,7 @@ ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]] ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]] ; VI: buffer_store_dword [[AND]] -define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) { +define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 
addrspace(1)* %b) { entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() Index: test/CodeGen/AMDGPU/array-ptr-calc-i32.ll =================================================================== --- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -24,7 +24,7 @@ ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64 ; SI-PROMOTE: ds_write_b32 [[PTRREG]] -define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 { +define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 { %alloca = alloca [16 x i32], align 16 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) Index: test/CodeGen/AMDGPU/array-ptr-calc-i64.ll =================================================================== --- test/CodeGen/AMDGPU/array-ptr-calc-i64.ll +++ test/CodeGen/AMDGPU/array-ptr-calc-i64.ll @@ -7,7 +7,7 @@ ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_i32 ; SI: s_endpgm -define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 Index: test/CodeGen/AMDGPU/ashr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/ashr.v2i16.ll +++ test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -13,7 +13,7 @@ ; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI: v_or_b32_e32 -define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -40,7 +40,7 @@ ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -57,7 +57,7 @@ ; GFX9: s_load_dword [[RHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -72,7 +72,7 @@ ; GFX9: s_load_dword [[LHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: 
v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -86,7 +86,7 @@ ; GCN-LABEL: {{^}}ashr_imm_v_v2i16: ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4 -define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -100,7 +100,7 @@ ; GCN-LABEL: {{^}}ashr_v_imm_v2i16: ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]] -define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -117,7 +117,7 @@ ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -135,7 +135,7 @@ ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext Index: test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -12,7 +12,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -33,7 +33,7 @@ ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} 
offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 @@ -45,7 +45,7 @@ ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -65,7 +65,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -84,7 +84,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] ; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 Index: test/CodeGen/AMDGPU/atomic_load_add.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_load_add.ll +++ test/CodeGen/AMDGPU/atomic_load_add.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}atomic_add_local: ; R600: LDS_ADD * ; SI: ds_add_u32 -define void @atomic_add_local(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) { %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst ret void } @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}atomic_add_local_const_offset: ; R600: LDS_ADD * ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst ret void @@ -22,7 +22,7 @@ ; FUNC-LABEL: {{^}}atomic_add_ret_local: ; R600: LDS_ADD_RET * ; SI: ds_add_rtn_u32 -define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out ret void @@ -31,7 
+31,7 @@ ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: ; R600: LDS_ADD_RET * ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/atomic_load_sub.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_load_sub.ll +++ test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_local: ; R600: LDS_SUB * ; SI: ds_sub_u32 -define void @atomic_sub_local(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) { %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst ret void } @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset: ; R600: LDS_SUB * ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst ret void @@ -22,7 +22,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_ret_local: ; R600: LDS_SUB_RET * ; SI: ds_sub_rtn_u32 -define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out ret void @@ -31,7 +31,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: ; R600: LDS_SUB_RET * ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll =================================================================== --- test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -5,7 +5,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_64_max_64() #0 { +define amdgpu_kernel void @min_64_max_64() #0 { entry: ret void } @@ -16,7 +16,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_64_max_128() #1 { +define amdgpu_kernel void @min_64_max_128() #1 { entry: ret void } @@ -27,7 +27,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_128_max_128() #2 { +define amdgpu_kernel void @min_128_max_128() #2 { entry: ret void } @@ -39,7 +39,7 @@ ; CHECK: NumSGPRsForWavesPerEU: 13 ; CHECK: NumVGPRsForWavesPerEU: 32 @var = addrspace(1) global float 0.0 -define void @min_1024_max_2048() #3 { +define amdgpu_kernel void @min_1024_max_2048() #3 { %val0 = load volatile float, 
float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var Index: test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll =================================================================== --- test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -8,7 +8,7 @@ ; ALL: SGPRBlocks: 1 ; ALL: NumSGPRsForWavesPerEU: 9 -define void @max_9_sgprs(i32 addrspace(1)* %out1, +define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, @@ -49,7 +49,7 @@ ; TOSMEM: SGPRBlocks: 1 ; TOSMEM: NumSGPRsForWavesPerEU: 16 -define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, +define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, @@ -90,7 +90,7 @@ ; XALL: SGPRBlocks: 2 ; XALL: NumSGPRsForWavesPerEU: 18 -;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, +;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, ; i32 addrspace(1)* %out2, ; i32 addrspace(1)* %out3, ; i32 addrspace(1)* %out4, Index: test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll =================================================================== --- test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll +++ test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}max_20_vgprs: ; CHECK: VGPRBlocks: 4 ; CHECK: NumVGPRsForWavesPerEU: 20 -define void @max_20_vgprs() #1 { +define amdgpu_kernel void @max_20_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var Index: test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll =================================================================== --- test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -6,7 +6,7 @@ ; CHECK: VGPRBlocks: 32 ; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 129 -define void @empty_exactly_1() #0 { +define amdgpu_kernel void @empty_exactly_1() #0 { entry: ret void } @@ -18,7 +18,7 @@ ; CHECK: VGPRBlocks: 10 ; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 41 -define void @empty_exactly_5() #1 { +define amdgpu_kernel void @empty_exactly_5() #1 { entry: ret void } @@ -30,7 +30,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_exactly_10() #2 { +define amdgpu_kernel void @empty_exactly_10() #2 { entry: ret void } @@ -42,7 +42,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_1() #3 { +define amdgpu_kernel void @empty_at_least_1() #3 { entry: ret void } @@ -54,7 +54,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_5() #4 { +define amdgpu_kernel void @empty_at_least_5() #4 { entry: ret void } @@ -66,7 +66,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_10() #5 { +define amdgpu_kernel void @empty_at_least_10() #5 { entry: ret void } @@ -80,7 +80,7 @@ ; CHECK: VGPRBlocks: 10 ; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 41 -define void @empty_at_most_5() #6 { +define amdgpu_kernel void @empty_at_most_5() #6 { entry: ret void } @@ -92,7 +92,7 @@ ; CHECK: VGPRBlocks: 0 ; 
CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_most_10() #7 { +define amdgpu_kernel void @empty_at_most_10() #7 { entry: ret void } @@ -106,7 +106,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_between_5_and_10() #8 { +define amdgpu_kernel void @empty_between_5_and_10() #8 { entry: ret void } @@ -120,7 +120,7 @@ ; CHECK: VGPRBlocks: 5 ; CHECK: NumSGPRsForWavesPerEU: 13 ; CHECK: NumVGPRsForWavesPerEU: 24 -define void @exactly_10() #9 { +define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var Index: test/CodeGen/AMDGPU/attr-unparseable.ll =================================================================== --- test/CodeGen/AMDGPU/attr-unparseable.ll +++ test/CodeGen/AMDGPU/attr-unparseable.ll @@ -1,56 +1,56 @@ ; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_0() #0 { +define amdgpu_kernel void @unparseable_single_0() #0 { entry: ret void } attributes #0 = {"amdgpu-num-sgpr"} ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_1() #1 { +define amdgpu_kernel void @unparseable_single_1() #1 { entry: ret void } attributes #1 = {"amdgpu-num-sgpr"="k"} ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_2() #2 { +define amdgpu_kernel void @unparseable_single_2() #2 { entry: ret void } attributes #2 = {"amdgpu-num-sgpr"="1,2"} ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_0() #3 { +define amdgpu_kernel void @unparseable_pair_0() #3 { entry: ret void } attributes #3 = {"amdgpu-flat-work-group-size"} ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_1() #4 { +define amdgpu_kernel void @unparseable_pair_1() #4 { entry: ret void } attributes #4 = {"amdgpu-flat-work-group-size"="k"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_2() #5 { +define amdgpu_kernel void @unparseable_pair_2() #5 { entry: ret void } attributes #5 = {"amdgpu-flat-work-group-size"="1"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_3() #6 { +define amdgpu_kernel void @unparseable_pair_3() #6 { entry: ret void } attributes #6 = {"amdgpu-flat-work-group-size"="1,k"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_4() #7 { +define amdgpu_kernel void @unparseable_pair_4() #7 { entry: ret void } Index: test/CodeGen/AMDGPU/basic-branch.ll =================================================================== --- test/CodeGen/AMDGPU/basic-branch.ll +++ test/CodeGen/AMDGPU/basic-branch.ll @@ -15,7 +15,7 @@ ; GCN: {{^}}[[END]]: ; GCN: s_endpgm -define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 { +define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 { %cmp = icmp ne i32 %val, 0 br i1 %cmp, label %store, label %end @@ -39,7 +39,7 @@ ; GCN: {{^}}[[END]]: ; GCN: s_endpgm -define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { +define amdgpu_kernel 
void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { %cmp0 = icmp ne i1 %val, 0 br i1 %cmp0, label %store, label %end Index: test/CodeGen/AMDGPU/basic-loop.ll =================================================================== --- test/CodeGen/AMDGPU/basic-loop.ll +++ test/CodeGen/AMDGPU/basic-loop.ll @@ -2,7 +2,7 @@ ; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_loop: -define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { +define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { entry: br label %loop.body Index: test/CodeGen/AMDGPU/bfe-patterns.ll =================================================================== --- test/CodeGen/AMDGPU/bfe-patterns.ll +++ test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]] -define void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -32,7 +32,7 @@ ; GCN: [[BFE]] ; GCN: [[SHL]] -define void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -52,7 +52,7 @@ ; GCN: s_load_dword [[WIDTH:s[0-9]+]] ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] -define void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width @@ -68,7 +68,7 @@ ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] ; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] -define void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width @@ -83,7 +83,7 @@ ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]] -define void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -110,7 
+110,7 @@ ; GCN: [[BFE]] ; GCN: [[SHL]] -define void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -130,7 +130,7 @@ ; GCN: s_load_dword [[WIDTH:s[0-9]+]] ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] -define void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width @@ -146,7 +146,7 @@ ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] ; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] -define void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width Index: test/CodeGen/AMDGPU/bfe_uint.ll =================================================================== --- test/CodeGen/AMDGPU/bfe_uint.ll +++ test/CodeGen/AMDGPU/bfe_uint.ll @@ -2,7 +2,7 @@ ; CHECK: {{^}}bfe_def: ; CHECK: BFE_UINT -define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) { entry: %0 = lshr i32 %x, 5 %1 = and i32 %0, 15 ; 0xf @@ -17,7 +17,7 @@ ; CHECK: {{^}}bfe_shift: ; CHECK-NOT: BFE_UINT -define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { entry: %0 = lshr i32 %x, 16 %1 = and i32 %0, 65535 ; 0xffff Index: test/CodeGen/AMDGPU/bfi_int.ll =================================================================== --- test/CodeGen/AMDGPU/bfi_int.ll +++ test/CodeGen/AMDGPU/bfi_int.ll @@ -9,7 +9,7 @@ ; R600: BFI_INT ; SI: @bfi_def ; SI: v_bfi_b32 -define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = xor i32 %x, -1 %1 = and i32 %z, %0 @@ -25,7 +25,7 @@ ; R600: BFI_INT ; SI: @bfi_sha256_ch ; SI: v_bfi_b32 -define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = xor i32 %y, %z %1 = and i32 %x, %0 @@ -42,7 +42,7 @@ ; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} ; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} -define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = and i32 %x, %z %1 = or i32 %x, %z Index: test/CodeGen/AMDGPU/bfm.ll =================================================================== --- test/CodeGen/AMDGPU/bfm.ll +++ test/CodeGen/AMDGPU/bfm.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}bfm_pattern: ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { 
+define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %a = shl i32 1, %x %b = sub i32 %a, 1 %c = shl i32 %b, %y @@ -14,7 +14,7 @@ ; FUNC-LABEL: {{^}}bfm_pattern_simple: ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 -define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 { %a = shl i32 1, %x %b = sub i32 %a, 1 store i32 %b, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/bitcast-vector-extract.ll =================================================================== --- test/CodeGen/AMDGPU/bitcast-vector-extract.ll +++ test/CodeGen/AMDGPU/bitcast-vector-extract.ll @@ -11,7 +11,7 @@ ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) { %vec0.bc = bitcast <8 x i32> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out @@ -27,7 +27,7 @@ ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) { %vec0.bc = bitcast <4 x i64> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out @@ -43,7 +43,7 @@ ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) { %vec0.bc = bitcast <4 x i64> to <4 x double> store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out @@ -59,7 +59,7 @@ ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) { %vec0.bc = bitcast <16 x i16> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source: ; GCN-NOT: store_dword -define void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { +define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 %bc = bitcast i64 %undef to <2 x i32> store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt: ; GCN-NOT: store_dword -define void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { +define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 %bc = bitcast i64 %undef to <2 x i32> %elt1 = extractelement <2 x i32> %bc, i32 1 Index: test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll 
=================================================================== --- test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll +++ test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}materialize_0_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) { store i32 0, i32 addrspace(1)* %out ret void } @@ -16,7 +16,7 @@ ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_0_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) { store i64 0, i64 addrspace(1)* %out ret void } @@ -24,7 +24,7 @@ ; GCN-LABEL: {{^}}materialize_neg1_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_neg1_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) { store i32 -1, i32 addrspace(1)* %out ret void } @@ -33,7 +33,7 @@ ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_neg1_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) { store i64 -1, i64 addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ ; GCN-LABEL: {{^}}materialize_signbit_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_signbit_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) { store i32 -2147483648, i32 addrspace(1)* %out ret void } @@ -50,7 +50,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_signbit_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) { store i64 -9223372036854775808, i64 addrspace(1)* %out ret void } @@ -58,7 +58,7 @@ ; GCN-LABEL: {{^}}materialize_rev_neg16_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { store i32 268435455, i32 addrspace(1)* %out ret void } @@ -67,7 +67,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { store i64 1152921504606846975, i64 addrspace(1)* %out ret void } @@ -75,7 +75,7 @@ ; GCN-LABEL: {{^}}materialize_rev_neg17_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { store i32 -134217729, i32 addrspace(1)* %out ret void } @@ -84,7 +84,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_neg17_i64(i64 addrspace(1)* 
%out) { +define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { store i64 -576460752303423489, i64 addrspace(1)* %out ret void } @@ -92,7 +92,7 @@ ; GCN-LABEL: {{^}}materialize_rev_64_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_64_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) { store i32 33554432, i32 addrspace(1)* %out ret void } @@ -101,7 +101,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_64_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) { store i64 144115188075855872, i64 addrspace(1)* %out ret void } @@ -109,7 +109,7 @@ ; GCN-LABEL: {{^}}materialize_rev_65_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_65_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) { store i32 -2113929216, i32 addrspace(1)* %out ret void } @@ -118,7 +118,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_65_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) { store i64 -9079256848778919936, i64 addrspace(1)* %out ret void } @@ -126,7 +126,7 @@ ; GCN-LABEL: {{^}}materialize_rev_3_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_3_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) { store i32 -1073741824, i32 addrspace(1)* %out ret void } @@ -135,7 +135,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_3_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) { store i64 -4611686018427387904, i64 addrspace(1)* %out ret void } @@ -143,7 +143,7 @@ ; GCN-LABEL: {{^}}materialize_rev_1.0_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { store i32 508, i32 addrspace(1)* %out ret void } @@ -152,70 +152,70 @@ ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) { store i64 508, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}s_materialize_0_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}} -define void @s_materialize_0_i32() { +define amdgpu_kernel void @s_materialize_0_i32() { call void asm sideeffect "; use $0", "s"(i32 0) ret void } ; GCN-LABEL: {{^}}s_materialize_1_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}} -define void @s_materialize_1_i32() { +define amdgpu_kernel void @s_materialize_1_i32() { call void asm sideeffect "; use $0", "s"(i32 1) ret void } ; GCN-LABEL: {{^}}s_materialize_neg1_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}} 
-define void @s_materialize_neg1_i32() { +define amdgpu_kernel void @s_materialize_neg1_i32() { call void asm sideeffect "; use $0", "s"(i32 -1) ret void } ; GCN-LABEL: {{^}}s_materialize_signbit_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}} -define void @s_materialize_signbit_i32() { +define amdgpu_kernel void @s_materialize_signbit_i32() { call void asm sideeffect "; use $0", "s"(i32 -2147483648) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_64_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}} -define void @s_materialize_rev_64_i32() { +define amdgpu_kernel void @s_materialize_rev_64_i32() { call void asm sideeffect "; use $0", "s"(i32 33554432) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_65_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}} -define void @s_materialize_rev_65_i32() { +define amdgpu_kernel void @s_materialize_rev_65_i32() { call void asm sideeffect "; use $0", "s"(i32 -2113929216) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}} -define void @s_materialize_rev_neg16_i32() { +define amdgpu_kernel void @s_materialize_rev_neg16_i32() { call void asm sideeffect "; use $0", "s"(i32 268435455) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}} -define void @s_materialize_rev_neg17_i32() { +define amdgpu_kernel void @s_materialize_rev_neg17_i32() { call void asm sideeffect "; use $0", "s"(i32 -134217729) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32: ; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}} -define void @s_materialize_rev_1.0_i32() { +define amdgpu_kernel void @s_materialize_rev_1.0_i32() { call void asm sideeffect "; use $0", "s"(i32 508) ret void } Index: test/CodeGen/AMDGPU/bitreverse.ll =================================================================== --- test/CodeGen/AMDGPU/bitreverse.ll +++ test/CodeGen/AMDGPU/bitreverse.ll @@ -14,7 +14,7 @@ ; FUNC-LABEL: {{^}}s_brev_i16: ; SI: s_brev_b32 -define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { +define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out ret void @@ -22,7 +22,7 @@ ; FUNC-LABEL: {{^}}v_brev_i16: ; SI: v_bfrev_b32_e32 -define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { %val = load i16, i16 addrspace(1)* %valptr %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out @@ -35,7 +35,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], ; SI: s_endpgm -define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { +define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out ret void @@ -46,7 +46,7 @@ ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { %val = load i32, i32 addrspace(1)* %valptr %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out @@ -56,7 +56,7 @@ ; FUNC-LABEL: {{^}}s_brev_v2i32: ; SI: s_brev_b32 ; SI: s_brev_b32 -define 
void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { +define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out ret void @@ -65,7 +65,7 @@ ; FUNC-LABEL: {{^}}v_brev_v2i32: ; SI: v_bfrev_b32_e32 ; SI: v_bfrev_b32_e32 -define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out @@ -73,7 +73,7 @@ } ; FUNC-LABEL: {{^}}s_brev_i64: -define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { +define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out ret void @@ -81,7 +81,7 @@ ; FUNC-LABEL: {{^}}v_brev_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 -define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { %val = load i64, i64 addrspace(1)* %valptr %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out @@ -89,14 +89,14 @@ } ; FUNC-LABEL: {{^}}s_brev_v2i64: -define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { +define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_brev_v2i64: -define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/br_cc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/br_cc.f16.ll +++ test/CodeGen/AMDGPU/br_cc.f16.ll @@ -20,7 +20,7 @@ ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[B_F16]] ; GCN: s_endpgm -define void @br_cc_f16( +define amdgpu_kernel void @br_cc_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -59,7 +59,7 @@ ; GCN: two{{$}} ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] -define void @br_cc_f16_imm_a( +define amdgpu_kernel void @br_cc_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -92,7 +92,7 @@ ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}} ; GCN: buffer_store_short v[[B_F16]] ; GCN: s_endpgm -define void @br_cc_f16_imm_b( +define amdgpu_kernel void @br_cc_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/branch-relax-spill.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relax-spill.ll +++ test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -5,7 +5,7 @@ ; FAIL: LLVM ERROR: Error while trying to spill VCC from 
class SReg_64: Cannot scavenge register without an emergency spill slot! -define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { entry: %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0 %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0 Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relaxation.ll +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -26,7 +26,7 @@ ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { bb: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch @@ -68,7 +68,7 @@ ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { bb0: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch @@ -108,7 +108,7 @@ ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { bb0: %cmp = fcmp oeq float %cnd, 0.0 br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch @@ -141,7 +141,7 @@ ; GCN: s_or_b64 exec, exec, [[SAVE]] ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 { +define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 @@ -188,7 +188,7 @@ ; GCN-NEXT: [[ENDBB]]: ; GCN-NEXT: s_endpgm -define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 { +define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 { bb: br label %bb2 @@ -243,7 +243,7 @@ ; GCN: buffer_store_dword [[BB4_K]] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .Lfunc_end{{[0-9]+}}: -define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -285,7 +285,7 @@ ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}} ; GCN-NEXT: s_setpc_b64 vcc ; GCN-NEXT .Lfunc_end{{[0-9]+}}: -define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { entry: br label %loop @@ -342,7 +342,7 @@ ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm -define void @expand_requires_expand(i32 %cond0) #0 { +define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { bb0: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp slt i32 %cond0, 0 @@ -399,7 +399,7 @@ ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm -define void @uniform_inside_divergent(i32 addrspace(1)* 
%out, i32 %cond) #0 { +define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -462,7 +462,7 @@ ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN: buffer_store_dword ; GCN-NEXT: s_endpgm -define void @analyze_mask_branch() #0 { +define amdgpu_kernel void @analyze_mask_branch() #0 { entry: %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"() %cmp0 = fcmp ogt float %reg, 0.000000e+00 Index: test/CodeGen/AMDGPU/bswap.ll =================================================================== --- test/CodeGen/AMDGPU/bswap.ll +++ test/CodeGen/AMDGPU/bswap.ll @@ -17,7 +17,7 @@ ; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone store i32 %bswap, i32 addrspace(1)* %out, align 4 @@ -32,7 +32,7 @@ ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 @@ -53,7 +53,7 @@ ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 @@ -86,7 +86,7 @@ ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 @@ -95,21 +95,21 @@ ; FUNC-LABEL: {{^}}test_bswap_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 -define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone store i64 %bswap, i64 addrspace(1)* %out, align 8 ret void } -define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 ret void } -define void 
@test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 Index: test/CodeGen/AMDGPU/build_vector.ll =================================================================== --- test/CodeGen/AMDGPU/build_vector.ll +++ test/CodeGen/AMDGPU/build_vector.ll @@ -10,7 +10,7 @@ ; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 ; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} -define void @build_vector2 (<2 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) { entry: store <2 x i32> , <2 x i32> addrspace(1)* %out ret void @@ -28,7 +28,7 @@ ; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 ; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 ; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} -define void @build_vector4 (<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) { entry: store <4 x i32> , <4 x i32> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/call.ll =================================================================== --- test/CodeGen/AMDGPU/call.ll +++ test/CodeGen/AMDGPU/call.ll @@ -10,7 +10,7 @@ declare i32 @external_function(i32) nounwind -define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -25,7 +25,7 @@ ret i32 %y } -define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -35,7 +35,7 @@ ret void } -define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr Index: test/CodeGen/AMDGPU/captured-frame-index.ll =================================================================== --- test/CodeGen/AMDGPU/captured-frame-index.ll +++ test/CodeGen/AMDGPU/captured-frame-index.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}store_fi_lifetime: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { entry: %b = alloca i8 call void @llvm.lifetime.start(i64 1, i8* %b) @@ -18,7 +18,7 @@ ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}} ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] -define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { %tmp = alloca float store float 4.0, float *%tmp store float* %tmp, float* addrspace(3)* %ptr @@ -38,7 +38,7 @@ ; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: ds_write_b32 [[VLDSPTR]], 
[[FI1]] -define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float store float 4.0, float* %tmp0 @@ -54,7 +54,7 @@ ; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -define void @stored_fi_to_self() #0 { +define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32* ; Avoid optimizing everything out @@ -73,7 +73,7 @@ ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} -define void @stored_fi_to_self_offset() #0 { +define amdgpu_kernel void @stored_fi_to_self_offset() #0 { %tmp0 = alloca [512 x i32] %tmp1 = alloca i32* @@ -98,7 +98,7 @@ ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} -define void @stored_fi_to_fi() #0 { +define amdgpu_kernel void @stored_fi_to_fi() #0 { %tmp0 = alloca i32* %tmp1 = alloca i32* %tmp2 = alloca i32* @@ -118,7 +118,7 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { %tmp = alloca float store float 0.0, float *%tmp store float* %tmp, float* addrspace(1)* %ptr @@ -136,7 +136,7 @@ ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float %tmp2 = alloca float @@ -163,7 +163,7 @@ ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { %tmp0 = alloca [4096 x i32] %tmp1 = alloca [4096 x i32] %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0 @@ -186,7 +186,7 @@ ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { +define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { entry: %b = alloca i32, align 4 %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4 Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll =================================================================== --- test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -7,7 +7,7 @@ ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] ; GCN: s_endpgm -define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { entry: %cmp = icmp eq i32 %n, -1 br i1 %cmp, label %for.exit, label %for.body @@ 
-31,7 +31,7 @@ ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] -define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}loop_const_false: ; GCN-NOT: s_branch ; GCN: s_endpgm -define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -74,7 +74,7 @@ ; GCN-LABEL: {{^}}loop_const_undef: ; GCN-NOT: s_branch ; GCN: s_endpgm -define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -104,7 +104,7 @@ ; GCN: s_cbranch_vccnz [[LOOPBB]] ; GCN-NEXT: ; BB#2 ; GCN-NEXT: s_endpgm -define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { +define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { entry: br label %for.body Index: test/CodeGen/AMDGPU/cf-stack-bug.ll =================================================================== --- test/CodeGen/AMDGPU/cf-stack-bug.ll +++ test/CodeGen/AMDGPU/cf-stack-bug.ll @@ -35,7 +35,7 @@ ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested3: -define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -68,7 +68,7 @@ ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested4: -define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -109,7 +109,7 @@ ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested7: -define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -174,7 +174,7 @@ ; BUG32: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested8: -define void @nested8(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end Index: test/CodeGen/AMDGPU/cf_end.ll =================================================================== --- test/CodeGen/AMDGPU/cf_end.ll +++ test/CodeGen/AMDGPU/cf_end.ll @@ -4,6 +4,6 @@ ; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] ; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] -define void @eop() { +define amdgpu_kernel void @eop() { ret void } Index: test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: ; GCN: flat_load_dword ; GCN: {{^}}BB0_2: -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 
addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 @@ -43,7 +43,7 @@ ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 @@ -76,7 +76,7 @@ ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -15,7 +15,7 @@ ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: ; GCN: {{^}}BB0_2: -define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 @@ -45,7 +45,7 @@ ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB1_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 @@ -72,7 +72,7 @@ ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} ; GCN: {{^}}BB2_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 @@ -99,7 +99,7 @@ ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB3_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 @@ -131,7 +131,7 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, off, 
{{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} ; GCN: {{^}}BB4_2: -define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -172,7 +172,7 @@ ; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} ; GCN: {{^BB[0-9]+}}_2: -define void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -209,7 +209,7 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: {{^BB[0-9]+}}_2: -define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -241,7 +241,7 @@ ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN: {{^BB[0-9]+}}_2: -define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { entry: %offset.ext = zext i32 %offset to i64 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 @@ -271,7 +271,7 @@ ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 @@ -300,7 +300,7 @@ ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 @@ -333,7 +333,7 @@ ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 @@ -365,7 +365,7 @@ ; GCN: s_addc_u32 s{{[0-9]+}}, 
s{{[0-9]+}}, 3{{$}} ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 @@ -396,7 +396,7 @@ ; GCN: s_addc_u32 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 @@ -426,7 +426,7 @@ ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 @@ -464,7 +464,7 @@ ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 @@ -494,7 +494,7 @@ ; GCN: s_load_dword [[SREG1:s[0-9]+]], ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] ; GCN-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5 -define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind { +define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 @@ -521,7 +521,7 @@ ; OPT: if: ; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64 ; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095 -define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095 @@ -548,7 +548,7 @@ ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %sunkaddr2, i32 2 seq_cst -define void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 
addrspace(3)* %in, i32 7 @@ -574,7 +574,7 @@ ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %sunkaddr2, i32 undef, i32 2 seq_cst monotonic -define void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 @@ -600,7 +600,7 @@ ; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 ; OPT: br i1 ; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic -define void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 @@ -627,7 +627,7 @@ ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2) -define void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 @@ -653,7 +653,7 @@ ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2) -define void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 Index: test/CodeGen/AMDGPU/cgp-bitfield-extract.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -36,7 +36,7 @@ ; GCN: BB0_3: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { entry: %shr = lshr i32 %arg1, 8 br i1 undef, label %bb0, label %bb1 @@ -76,7 +76,7 @@ ; OPT: ret ; GCN-LABEL: {{^}}sink_sbfe_i32: -define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { +define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { entry: %shr = ashr i32 %arg1, 8 br i1 undef, label %bb0, label %bb1 @@ -134,7 +134,7 @@ ; GCN: BB2_3: ; GCN: buffer_store_short ; GCN: s_endpgm -define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 { entry: %shr = lshr i16 %arg1, 4 br i1 undef, label %bb0, label %bb1 @@ -187,7 +187,7 @@ ; GCN: BB3_3: ; GCN: buffer_store_dwordx2 
-define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 30 br i1 undef, label %bb0, label %bb1 @@ -236,7 +236,7 @@ ; GCN: BB4_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 15 br i1 undef, label %bb0, label %bb1 @@ -283,7 +283,7 @@ ; GCN: BB5_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 35 br i1 undef, label %bb0, label %bb1 Index: test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll =================================================================== --- test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -8,7 +8,7 @@ ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} -define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { +define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { bb0: %tmp = icmp sgt i32 %arg1, 4 %c = icmp eq i32 %arg3, 0 @@ -35,7 +35,7 @@ ; GCN-NOT: vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} -define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { +define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) Index: test/CodeGen/AMDGPU/coalescer_remat.ll =================================================================== --- test/CodeGen/AMDGPU/coalescer_remat.ll +++ test/CodeGen/AMDGPU/coalescer_remat.ll @@ -13,7 +13,7 @@ ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: ; CHECK: ; NumVgprs: 8 -define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { +define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 br i1 %cmpflag, label %loop, label %exit Index: test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll =================================================================== --- test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll +++ test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll @@ -8,7 +8,7 @@ ; SI-LLC-LABEL: {{^}}test: ; SI-LLC: s_mul_i32 ; SI-LLC-NOT: mul -define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { +define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { entry: %0 = mul nsw i32 %a, 3 %1 = sext i32 %0 to i64 Index: test/CodeGen/AMDGPU/combine_vloads.ll =================================================================== --- test/CodeGen/AMDGPU/combine_vloads.ll +++ test/CodeGen/AMDGPU/combine_vloads.ll @@ -12,7 +12,7 @@ ; EG-LABEL: {{^}}combine_vloads: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind { +define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> 
addrspace(1)* nocapture %result) nounwind { entry: br label %for.body Index: test/CodeGen/AMDGPU/commute-compares.ll =================================================================== --- test/CodeGen/AMDGPU/commute-compares.ll +++ test/CodeGen/AMDGPU/commute-compares.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}commute_eq_64_i32: ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}commute_ne_64_i32: ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -36,7 +36,7 @@ ; GCN-LABEL: {{^}}commute_ne_litk_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -49,7 +49,7 @@ ; GCN-LABEL: {{^}}commute_ugt_64_i32: ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -62,7 +62,7 @@ ; GCN-LABEL: {{^}}commute_uge_64_i32: ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} -define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -75,7 +75,7 @@ ; GCN-LABEL: {{^}}commute_ult_64_i32: ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}commute_ule_63_i32: ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -104,7 +104,7 @@ ; GCN-LABEL: 
{{^}}commute_ule_64_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -117,7 +117,7 @@ ; GCN-LABEL: {{^}}commute_sgt_neg1_i32: ; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} -define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -130,7 +130,7 @@ ; GCN-LABEL: {{^}}commute_sge_neg2_i32: ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} -define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -143,7 +143,7 @@ ; GCN-LABEL: {{^}}commute_slt_neg16_i32: ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} -define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -156,7 +156,7 @@ ; GCN-LABEL: {{^}}commute_sle_5_i32: ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} -define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -173,7 +173,7 @@ ; GCN-LABEL: {{^}}commute_eq_64_i64: ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -186,7 +186,7 @@ ; GCN-LABEL: {{^}}commute_ne_64_i64: ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -199,7 +199,7 @@ ; GCN-LABEL: {{^}}commute_ugt_64_i64: ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* 
%in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -212,7 +212,7 @@ ; GCN-LABEL: {{^}}commute_uge_64_i64: ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -225,7 +225,7 @@ ; GCN-LABEL: {{^}}commute_ult_64_i64: ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -238,7 +238,7 @@ ; GCN-LABEL: {{^}}commute_ule_63_i64: ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -254,7 +254,7 @@ ; GCN-LABEL: {{^}}commute_ule_64_i64: ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} ; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -267,7 +267,7 @@ ; GCN-LABEL: {{^}}commute_sgt_neg1_i64: ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -280,7 +280,7 @@ ; GCN-LABEL: {{^}}commute_sge_neg2_i64: ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -293,7 +293,7 @@ ; GCN-LABEL: {{^}}commute_slt_neg16_i64: ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* 
%out, i32 %tid @@ -306,7 +306,7 @@ ; GCN-LABEL: {{^}}commute_sle_5_i64: ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -324,7 +324,7 @@ ; GCN-LABEL: {{^}}commute_oeq_2.0_f32: ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -338,7 +338,7 @@ ; GCN-LABEL: {{^}}commute_ogt_2.0_f32: ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -351,7 +351,7 @@ ; GCN-LABEL: {{^}}commute_oge_2.0_f32: ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -364,7 +364,7 @@ ; GCN-LABEL: {{^}}commute_olt_2.0_f32: ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -377,7 +377,7 @@ ; GCN-LABEL: {{^}}commute_ole_2.0_f32: ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -390,7 +390,7 @@ ; GCN-LABEL: {{^}}commute_one_2.0_f32: ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -403,7 +403,7 @@ ; GCN-LABEL: {{^}}commute_ord_2.0_f32: ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void 
@commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -416,7 +416,7 @@ ; GCN-LABEL: {{^}}commute_ueq_2.0_f32: ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -429,7 +429,7 @@ ; GCN-LABEL: {{^}}commute_ugt_2.0_f32: ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -442,7 +442,7 @@ ; GCN-LABEL: {{^}}commute_uge_2.0_f32: ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -455,7 +455,7 @@ ; GCN-LABEL: {{^}}commute_ult_2.0_f32: ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -468,7 +468,7 @@ ; GCN-LABEL: {{^}}commute_ule_2.0_f32: ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -481,7 +481,7 @@ ; GCN-LABEL: {{^}}commute_une_2.0_f32: ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -494,7 +494,7 @@ ; GCN-LABEL: {{^}}commute_uno_2.0_f32: ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, 
i32 %tid @@ -512,7 +512,7 @@ ; GCN-LABEL: {{^}}commute_oeq_2.0_f64: ; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -526,7 +526,7 @@ ; GCN-LABEL: {{^}}commute_ogt_2.0_f64: ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -539,7 +539,7 @@ ; GCN-LABEL: {{^}}commute_oge_2.0_f64: ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -552,7 +552,7 @@ ; GCN-LABEL: {{^}}commute_olt_2.0_f64: ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -565,7 +565,7 @@ ; GCN-LABEL: {{^}}commute_ole_2.0_f64: ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -578,7 +578,7 @@ ; GCN-LABEL: {{^}}commute_one_2.0_f64: ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -591,7 +591,7 @@ ; GCN-LABEL: {{^}}commute_ord_2.0_f64: ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -604,7 +604,7 @@ ; GCN-LABEL: {{^}}commute_ueq_2.0_f64: ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void 
@commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -617,7 +617,7 @@ ; GCN-LABEL: {{^}}commute_ugt_2.0_f64: ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -630,7 +630,7 @@ ; GCN-LABEL: {{^}}commute_uge_2.0_f64: ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -643,7 +643,7 @@ ; GCN-LABEL: {{^}}commute_ult_2.0_f64: ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -656,7 +656,7 @@ ; GCN-LABEL: {{^}}commute_ule_2.0_f64: ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -669,7 +669,7 @@ ; GCN-LABEL: {{^}}commute_une_2.0_f64: ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -682,7 +682,7 @@ ; GCN-LABEL: {{^}}commute_uno_2.0_f64: ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -703,7 +703,7 @@ ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}} -define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 
{ entry: %stack0 = alloca i32 %ptr0 = load volatile i32*, i32* addrspace(1)* undef Index: test/CodeGen/AMDGPU/commute_modifiers.ll =================================================================== --- test/CodeGen/AMDGPU/commute_modifiers.ll +++ test/CodeGen/AMDGPU/commute_modifiers.ll @@ -8,7 +8,7 @@ ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0 ; SI: buffer_store_dword [[REG]] -define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -22,7 +22,7 @@ ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0 ; SI: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -37,7 +37,7 @@ ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] ; SI: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -53,7 +53,7 @@ ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]| ; SI: buffer_store_dword [[REG]] -define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -68,7 +68,7 @@ ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -85,7 +85,7 @@ ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr 
float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -102,7 +102,7 @@ ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -121,7 +121,7 @@ ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -139,7 +139,7 @@ ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -161,7 +161,7 @@ ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]| ; SI: buffer_store_dword [[RESULT]] -define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/concat_vectors.ll =================================================================== --- test/CodeGen/AMDGPU/concat_vectors.ll +++ test/CodeGen/AMDGPU/concat_vectors.ll @@ -8,7 +8,7 @@ ; value if we want to ensure scratch memory is not being used. 
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -17,7 +17,7 @@ ; FUNC-LABEL: {{^}}test_concat_v2i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -26,7 +26,7 @@ ; FUNC-LABEL: {{^}}test_concat_v4i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 ret void @@ -35,7 +35,7 @@ ; FUNC-LABEL: {{^}}test_concat_v8i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 ret void @@ -44,7 +44,7 @@ ; FUNC-LABEL: {{^}}test_concat_v16i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 ret void @@ -53,7 +53,7 @@ ; FUNC-LABEL: {{^}}test_concat_v1f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 ret void @@ -62,7 +62,7 @@ ; FUNC-LABEL: {{^}}test_concat_v2f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 ret void @@ -71,7 +71,7 @@ ; FUNC-LABEL: {{^}}test_concat_v4f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* 
%out, <4 x float> %a, <4 x float> %b) nounwind { %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 ret void @@ -80,7 +80,7 @@ ; FUNC-LABEL: {{^}}test_concat_v8f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 ret void @@ -89,7 +89,7 @@ ; FUNC-LABEL: {{^}}test_concat_v16f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 ret void @@ -98,7 +98,7 @@ ; FUNC-LABEL: {{^}}test_concat_v1i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 ret void @@ -107,7 +107,7 @@ ; FUNC-LABEL: {{^}}test_concat_v2i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 ret void @@ -116,7 +116,7 @@ ; FUNC-LABEL: {{^}}test_concat_v4i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 ret void @@ -125,7 +125,7 @@ ; FUNC-LABEL: {{^}}test_concat_v8i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 ret void @@ -134,7 +134,7 @@ ; FUNC-LABEL: {{^}}test_concat_v16i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %concat = 
shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 ret void @@ -143,7 +143,7 @@ ; FUNC-LABEL: {{^}}test_concat_v1f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 ret void @@ -152,7 +152,7 @@ ; FUNC-LABEL: {{^}}test_concat_v2f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 ret void @@ -161,7 +161,7 @@ ; FUNC-LABEL: {{^}}test_concat_v4f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 ret void @@ -170,7 +170,7 @@ ; FUNC-LABEL: {{^}}test_concat_v8f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 ret void @@ -179,7 +179,7 @@ ; FUNC-LABEL: {{^}}test_concat_v16f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 ret void @@ -188,7 +188,7 @@ ; FUNC-LABEL: {{^}}test_concat_v1i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> store <2 x i1> %concat, <2 x i1> addrspace(1)* %out ret void @@ -197,7 +197,7 @@ ; FUNC-LABEL: {{^}}test_concat_v2i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> store <4 x i1> %concat, <4 x i1> addrspace(1)* %out ret void @@ -206,7 +206,7 
@@ ; FUNC-LABEL: {{^}}test_concat_v4i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> store <8 x i1> %concat, <8 x i1> addrspace(1)* %out ret void @@ -215,7 +215,7 @@ ; FUNC-LABEL: {{^}}test_concat_v8i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> store <16 x i1> %concat, <16 x i1> addrspace(1)* %out ret void @@ -224,7 +224,7 @@ ; FUNC-LABEL: {{^}}test_concat_v16i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> store <32 x i1> %concat, <32 x i1> addrspace(1)* %out ret void @@ -233,7 +233,7 @@ ; FUNC-LABEL: {{^}}test_concat_v32i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> store <64 x i1> %concat, <64 x i1> addrspace(1)* %out ret void @@ -242,7 +242,7 @@ ; FUNC-LABEL: {{^}}test_concat_v1i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 ret void @@ -251,7 +251,7 @@ ; FUNC-LABEL: {{^}}test_concat_v2i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 ret void @@ -260,7 +260,7 @@ ; FUNC-LABEL: {{^}}test_concat_v4i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 ret void @@ -269,7 +269,7 @@ ; FUNC-LABEL: {{^}}test_concat_v8i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { %concat = shufflevector <8 x 
i16> %a, <8 x i16> %b, <16 x i32> store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 ret void @@ -278,7 +278,7 @@ ; FUNC-LABEL: {{^}}test_concat_v16i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 ret void @@ -286,7 +286,7 @@ ; FUNC-LABEL: {{^}}concat_vector_crash: ; SI: s_endpgm -define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { bb: %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> Index: test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir =================================================================== --- test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -1,12 +1,12 @@ # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s --- | - define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %and = and i32 %a, 1234567 store volatile i32 %and, i32 addrspace(1)* %out ret void } - define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -17,13 +17,13 @@ ret void } - define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl = shl i32 %a, 12 store volatile i32 %shl, i32 addrspace(1)* %out ret void } - define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -34,13 +34,13 @@ ret void } - define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %ashr = ashr i32 %a, 12 store volatile i32 %ashr, i32 addrspace(1)* %out ret void } - define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -51,13 +51,13 @@ ret void } - define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %lshr = 
lshr i32 %a, 12 store volatile i32 %lshr, i32 addrspace(1)* %out ret void } - define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom Index: test/CodeGen/AMDGPU/constant-fold-mi-operands.ll =================================================================== --- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -5,7 +5,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_and_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %and = and i32 %size, %x @@ -17,7 +17,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %and = and i32 %size, %x store i32 %and, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_or_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %or = or i32 %size, %x @@ -42,7 +42,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %or = or i32 %size, %x store i32 %or, i32 addrspace(1)* %out @@ -53,7 +53,7 @@ ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, %x @@ -67,7 +67,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, %x store i32 %xor, i32 addrspace(1)* %out @@ -78,7 +78,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, -1 store i32 %xor, i32 addrspace(1)* %out @@ -91,7 +91,7 @@ ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @fold_mi_v_not_0(i64 addrspace(1)* %out) 
{ +define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { %vreg = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg) %xor = xor i64 %ctpop, -1 @@ -110,7 +110,7 @@ ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @fold_mi_or_neg1(i64 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) { %vreg0 = load volatile i64, i64 addrspace(1)* undef %vreg1 = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) @@ -126,7 +126,7 @@ ; GCN: v_not_b32 ; GCN: v_and_b32 ; GCN-NOT: v_and_b32 -define void @fold_mi_and_neg1(i64 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) { %vreg0 = load volatile i64, i64 addrspace(1)* undef %vreg1 = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) Index: test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -72,7 +72,7 @@ ; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] -define void @divergent_if_endif(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef @@ -150,7 +150,7 @@ ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] -define void @divergent_loop(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef @@ -272,7 +272,7 @@ ; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] -define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef Index: test/CodeGen/AMDGPU/convergent-inlineasm.ll =================================================================== --- test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -6,7 +6,7 @@ ; GCN: v_cmp_ne_u32_e64 ; GCN: ; mask branch ; GCN: BB{{[0-9]+_[0-9]+}}: -define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { +define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1 @@ -30,7 +30,7 @@ ; GCN: BB{{[0-9]+_[0-9]+}}: -define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { +define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) Index: test/CodeGen/AMDGPU/copy-illegal-type.ll 
=================================================================== --- test/CodeGen/AMDGPU/copy-illegal-type.ll +++ test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,7 +8,7 @@ ; GCN: buffer_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -32,7 +32,7 @@ ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -47,7 +47,7 @@ ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -65,7 +65,7 @@ ; GCN-DAG: buffer_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %add = add <4 x i8> %val, store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -85,7 +85,7 @@ ; GCN: {{buffer|flat}}_store_dword ; GCN: {{buffer|flat}}_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %val = load <4 x i8>, <4 x i8> 
addrspace(1)* %in.ptr, align 4 @@ -101,7 +101,7 @@ ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm -define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -113,7 +113,7 @@ ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm -define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 ret void @@ -128,7 +128,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 ret void @@ -141,7 +141,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void @@ -157,7 +157,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/copy-to-reg.ll =================================================================== --- test/CodeGen/AMDGPU/copy-to-reg.ll +++ test/CodeGen/AMDGPU/copy-to-reg.ll @@ -6,7 +6,7 @@ ; Make sure this doesn't crash ; CHECK-LABEL: {{^}}copy_to_reg_frameindex: -define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %alloca = alloca [16 x i32] br label %loop Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -27,7 +27,7 @@ ; EG: FFBH_UINT ; EG: CNDE_INT -define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -43,7 +43,7 @@ ; EG: FFBH_UINT ; EG: 
CNDE_INT -define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 @@ -61,7 +61,7 @@ ; EG: CNDE_INT ; EG: FFBH_UINT ; EG: CNDE_INT -define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 @@ -89,7 +89,7 @@ ; EG-DAG: FFBH_UINT ; EG-DAG: CNDE_INT -define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 @@ -101,7 +101,7 @@ ; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], ; GCN: s_endpgm -define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -119,14 +119,14 @@ ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_ctlz_i64_trunc: -define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -145,7 +145,7 @@ ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]] ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} -define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -156,7 +156,7 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i64_trunc: -define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void 
@v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -172,7 +172,7 @@ ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm - define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -186,7 +186,7 @@ ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -202,7 +202,7 @@ ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm -define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %ctlz, 32 @@ -217,7 +217,7 @@ ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm -define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %ctlz, 32 @@ -230,7 +230,7 @@ ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: {{buffer|flat}}_store_byte [[FFBH]], - define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -245,7 +245,7 @@ ; SI: buffer_load_ushort [[VAL:v[0-9]+]], ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; SI: buffer_store_short [[FFBH]], - define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { %val = load i16, i16 addrspace(1)* %valptr %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone %cmp = icmp eq i16 %val, 0 @@ -260,7 +260,7 @@ ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]] ; GCN: {{buffer|flat}}_store_byte [[TRUNC]], -define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) 
nounwind { +define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid %val = load i7, i7 addrspace(1)* %valptr.gep Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -22,7 +22,7 @@ ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 @@ -51,7 +51,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 @@ -71,7 +71,7 @@ ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] -define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 @@ -82,7 +82,7 @@ ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], -define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -100,14 +100,14 @@ ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) store i64 %ctlz, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc: -define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -123,7 +123,7 @@ ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -134,7 +134,7 @@ } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc: -define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -149,7 +149,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], - define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = 
icmp eq i32 %val, 0 @@ -162,7 +162,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], -define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -175,7 +175,7 @@ ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: {{buffer|flat}}_store_byte [[FFBH]], -define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -194,7 +194,7 @@ ; GCN-DAG: buffer_store_dword [[RESULT0]] ; GCN-DAG: buffer_store_byte [[RESULT1]] ; GCN: s_endpgm - define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -211,7 +211,7 @@ ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -227,7 +227,7 @@ ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword -define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -243,7 +243,7 @@ ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 1 @@ -259,7 +259,7 @@ ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword -define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 
@llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 1 Index: test/CodeGen/AMDGPU/ctpop.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop.ll +++ test/CodeGen/AMDGPU/ctpop.ll @@ -16,7 +16,7 @@ ; GCN: s_endpgm ; EG: BCNT_INT -define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 ret void @@ -30,7 +30,7 @@ ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 @@ -48,7 +48,7 @@ ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { +define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { %val0 = load i32, i32 addrspace(1)* %in0, align 4 %val1 = load i32, i32 addrspace(1)* %in1, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone @@ -64,7 +64,7 @@ ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { +define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { %val0 = load i32, i32 addrspace(1)* %in0, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %add = add i32 %ctpop0, %sval @@ -79,7 +79,7 @@ ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -97,7 +97,7 @@ ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -123,7 +123,7 @@ ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) 
nounwind readnone store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 @@ -165,7 +165,7 @@ ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 @@ -179,7 +179,7 @@ ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 4 @@ -194,7 +194,7 @@ ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 4, %ctpop @@ -209,7 +209,7 @@ ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 99999 @@ -225,7 +225,7 @@ ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, %const @@ -241,7 +241,7 @@ ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %const, %ctpop @@ -258,7 +258,7 @@ ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4 @@ -279,7 +279,7 @@ ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT -define void 
@ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else Index: test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop64.ll +++ test/CodeGen/AMDGPU/ctpop64.ll @@ -17,7 +17,7 @@ ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], ; GCN: s_endpgm -define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -31,7 +31,7 @@ ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 @@ -48,7 +48,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm -define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { +define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %or = or i64 %ctpop, %s.val @@ -60,7 +60,7 @@ ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 ; GCN: s_endpgm -define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { +define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -73,7 +73,7 @@ ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 ; GCN: s_endpgm -define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { +define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -86,7 +86,7 @@ ; GCN: v_bcnt_u32_b32 ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm -define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> @@ -104,7 +104,7 @@ ; GCN: v_bcnt_u32_b32 ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm -define void @v_ctpop_v4i64(<4 x i32> 
addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -121,7 +121,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]] ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} ; GCN: s_endpgm -define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else @@ -146,7 +146,7 @@ ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]], ; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]] ; GCN: s_endpgm -define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -159,7 +159,7 @@ ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]], ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]] ; GCN: s_endpgm -define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone %truncctpop = trunc i65 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -181,7 +181,7 @@ ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { %val = load i128, i128 addrspace(1)* %in, align 8 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 Index: test/CodeGen/AMDGPU/cttz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -14,7 +14,7 @@ ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 @@ -43,7 +43,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? 
*}}[[RESULT]] -define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 @@ -63,7 +63,7 @@ ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 Index: test/CodeGen/AMDGPU/cube.ll =================================================================== --- test/CodeGen/AMDGPU/cube.ll +++ test/CodeGen/AMDGPU/cube.ll @@ -12,7 +12,7 @@ ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: _store_dwordx4 -define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 { %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c) %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c) Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -10,7 +10,7 @@ ; GCN-NOT: lshr ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dword [[CONV]], -define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %cvt = uitofp i8 %load to float store float %cvt, float addrspace(1)* %out, align 4 @@ -22,7 +22,7 @@ ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 %cvt = uitofp <2 x i8> %load to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 @@ -36,7 +36,7 @@ ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> 
addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 @@ -52,7 +52,7 @@ ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -76,7 +76,7 @@ ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]] ; GCN: buffer_store_dwordx4 -define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -110,7 +110,7 @@ ; GCN: {{buffer|flat}}_store_dword ; GCN: s_endpgm -define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -124,7 +124,7 @@ ; Make sure this doesn't crash. 
; GCN-LABEL: {{^}}load_v7i8_to_v7f32: ; GCN: s_endpgm -define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <7 x i8> %load to <7 x float> store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 @@ -147,7 +147,7 @@ ; GCN-NOT: lshr ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 @@ -159,7 +159,7 @@ ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] ; GCN: buffer_store_dword [[CONV]], -define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 2 %inreg = and i32 %add, 255 @@ -169,7 +169,7 @@ } ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: -define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %inreg = and i32 %load, 65280 %shr = lshr i32 %inreg, 8 @@ -181,7 +181,7 @@ ; We don't get these ones because of the zext, but instcombine removes ; them so it shouldn't really matter. 
; GCN-LABEL: {{^}}i8_zext_i32_to_f32: -define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %ext = zext i8 %load to i32 %cvt = uitofp i32 %ext to float @@ -190,7 +190,7 @@ } ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: -define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %ext = zext <4 x i8> %load to <4 x i32> %cvt = uitofp <4 x i32> %ext to <4 x float> @@ -203,7 +203,7 @@ ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %and = and i32 %val, 255 %cvt = uitofp i32 %and to float @@ -216,7 +216,7 @@ ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 8 %and = and i32 %srl, 255 @@ -230,7 +230,7 @@ ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 16 %and = and i32 %srl, 255 @@ -244,7 +244,7 @@ ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 24 %and = and i32 %srl, 255 Index: test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll +++ test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll @@ -10,7 +10,7 @@ ; SI-NOT: add ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %floor = call float @llvm.floor.f32(float %x) #1 %cvt = fptosi float %floor to i32 store i32 %cvt, i32 addrspace(1)* %out @@ -22,7 +22,7 @@ ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 
1.0 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptosi float %floor to i32 @@ -35,7 +35,7 @@ ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| ; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %floor = call float @llvm.floor.f32(float %x.fabs) #1 %cvt = fptosi float %floor to i32 @@ -48,7 +48,7 @@ ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fneg = fsub float -0.000000e+00, %x %floor = call float @llvm.floor.f32(float %x.fneg) #1 %cvt = fptosi float %floor to i32 @@ -61,7 +61,7 @@ ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| ; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 @@ -75,7 +75,7 @@ ; SI: v_floor_f32 ; SI: v_cvt_u32_f32_e32 ; SI: s_endpgm -define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %floor = call float @llvm.floor.f32(float %x) #1 %cvt = fptoui float %floor to i32 store i32 %cvt, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll +++ test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll @@ -9,7 +9,7 @@ ; SI-SAFE-NOT: v_cvt_rpi_i32_f32 ; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptosi float %floor to i32 @@ -21,7 +21,7 @@ ; SI-SAFE-NOT: v_cvt_rpi_i32_f32 ; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %fadd = fadd float %x.fabs, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 @@ -37,7 +37,7 @@ ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fneg = fsub float -0.000000e+00, %x %fadd = fadd float %x.fneg, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 @@ -55,7 +55,7 @@ ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %x.fabs.fneg = fsub float -0.000000e+00, 
%x.fabs %fadd = fadd float %x.fabs.fneg, 0.5 @@ -71,7 +71,7 @@ ; SI: v_floor_f32 ; SI: v_cvt_u32_f32 ; SI: s_endpgm -define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptoui float %floor to i32 Index: test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll =================================================================== --- test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll +++ test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll @@ -9,7 +9,7 @@ ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]] ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]] -define void @store_same_base_ptr(i32 addrspace(1)* %out) { +define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #0 %offset = sext i32 %id to i64 Index: test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll =================================================================== --- test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll +++ test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll @@ -10,7 +10,7 @@ ; CHECK: {{^}}sint: ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %sint = load i32, i32 addrspace(1) * %in @@ -24,7 +24,7 @@ ;CHECK: {{^}}uint: ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %uint = load i32, i32 addrspace(1) * %in Index: test/CodeGen/AMDGPU/debug.ll =================================================================== --- test/CodeGen/AMDGPU/debug.ll +++ test/CodeGen/AMDGPU/debug.ll @@ -4,7 +4,7 @@ ; Test for a crash in the custom assembly dump code. 
; SI: s_endpgm -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { store i32 0, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/debugger-emit-prologue.ll =================================================================== --- test/CodeGen/AMDGPU/debugger-emit-prologue.ll +++ test/CodeGen/AMDGPU/debugger-emit-prologue.ll @@ -23,7 +23,7 @@ ; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 Index: test/CodeGen/AMDGPU/debugger-insert-nops.ll =================================================================== --- test/CodeGen/AMDGPU/debugger-insert-nops.ll +++ test/CodeGen/AMDGPU/debugger-insert-nops.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: s_endpgm ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 Index: test/CodeGen/AMDGPU/debugger-reserve-regs.ll =================================================================== --- test/CodeGen/AMDGPU/debugger-reserve-regs.ll +++ test/CodeGen/AMDGPU/debugger-reserve-regs.ll @@ -6,7 +6,7 @@ ; CHECK-NEXT: ReservedVGPRCount: 4 ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 Index: test/CodeGen/AMDGPU/default-fp-mode.ll =================================================================== --- test/CodeGen/AMDGPU/default-fp-mode.ll +++ test/CodeGen/AMDGPU/default-fp-mode.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}test_default_si: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { +define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -12,7 +12,7 @@ ; GCN-LABEL: {{^}}test_default_vi: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { +define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}test_f64_denormals: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { +define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}test_f32_denormals: ; GCNL: FloatMode: 48 ; GCN: IeeeMode: 1 -define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { +define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -39,7 +39,7 @@ ; 
GCN-LABEL: {{^}}test_f32_f64_denormals: ; GCN: FloatMode: 240 ; GCN: IeeeMode: 1 -define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { +define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -48,7 +48,7 @@ ; GCN-LABEL: {{^}}test_no_denormals ; GCN: FloatMode: 0 ; GCN: IeeeMode: 1 -define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { +define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -57,7 +57,7 @@ ; GCN-LABEL: {{^}}test_f16_f64_denormals: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 { +define amdgpu_kernel void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 { store half 0.0, half addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -66,7 +66,7 @@ ; GCN-LABEL: {{^}}test_no_f16_f64_denormals: ; GCN: FloatMode: 0 ; GCN: IeeeMode: 1 -define void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 { +define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 { store half 0.0, half addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -75,7 +75,7 @@ ; GCN-LABEL: {{^}}test_f32_f16_f64_denormals: ; GCN: FloatMode: 240 ; GCN: IeeeMode: 1 -define void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 { +define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 { store half 0.0, half addrspace(1)* %out0 store float 0.0, float addrspace(1)* %out1 store double 0.0, double addrspace(1)* %out2 Index: test/CodeGen/AMDGPU/detect-dead-lanes.mir =================================================================== --- test/CodeGen/AMDGPU/detect-dead-lanes.mir +++ test/CodeGen/AMDGPU/detect-dead-lanes.mir @@ -1,14 +1,14 @@ # RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s --- | - define void @test0() { ret void } - define void @test1() { ret void } - define void @test2() { ret void } - define void @test3() { ret void } - define void @test4() { ret void } - define void @test5() { ret void } - define void @loop0() { ret void } - define void @loop1() { ret void } - define void @loop2() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } + define amdgpu_kernel void @test2() { ret void } + define amdgpu_kernel void @test3() { ret void } + define amdgpu_kernel void @test4() { ret void } + define amdgpu_kernel void @test5() { ret void } + define amdgpu_kernel void @loop0() { ret void } + define amdgpu_kernel void @loop1() { ret void } + define amdgpu_kernel void @loop2() { ret void } ... --- # Combined use/def transfer check, the basics. 
Index: test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll =================================================================== --- test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll +++ test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll @@ -9,7 +9,7 @@ ; CHECK: ALU_PUSH_BEFORE ; CHECK-NEXT: JUMP ; CHECK-NEXT: LOOP_BREAK -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { +define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { entry: %cmp5 = icmp sgt i32 %iterations, 0 br i1 %cmp5, label %for.body, label %for.end Index: test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll +++ test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -9,7 +9,7 @@ ; GCN: buffer_load_dword ; GCN: ds_write2_b32 ; GCN: s_endpgm -define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { +define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { entry: %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx = shl i32 %tid, 2 Index: test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll =================================================================== --- test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -23,7 +23,7 @@ ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 ; CHECK: s_endpgm -define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { +define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { entry: %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %mul = shl nsw i32 %x.i, 1 Index: test/CodeGen/AMDGPU/ds-sub-offset.ll =================================================================== --- test/CodeGen/AMDGPU/ds-sub-offset.ll +++ test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -9,7 +9,7 @@ ; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 -define void @write_ds_sub0_offset0_global() #0 { +define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 0, %x.i @@ -24,7 +24,7 @@ ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535 -define void @add_x_shl_neg_to_sub_max_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -39,7 +39,7 @@ ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]]{{$}} -define void @add_x_shl_neg_to_sub_max_offset_p1() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -58,7 +58,7 @@ ; GCN-NOT: v_sub ; 
GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}} ; GCN: s_endpgm -define void @add_x_shl_neg_to_sub_multi_use() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -80,7 +80,7 @@ ; GCN-NOT: v_sub ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} ; GCN: s_endpgm -define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -95,7 +95,7 @@ ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255 -define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -109,7 +109,7 @@ ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} -define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 Index: test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2.ll +++ test/CodeGen/AMDGPU/ds_read2.ll @@ -12,7 +12,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -31,7 +31,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -49,7 +49,7 @@ ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 ; SI: s_endpgm -define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -66,7 +66,7 @@ ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { %tid.x = tail 
call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -98,7 +98,7 @@ ; SI: s_barrier ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -133,7 +133,7 @@ ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -170,7 +170,7 @@ ; SI: ds_read_b32 ; SI: ds_read_b32 ; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 @@ -196,7 +196,7 @@ ; SI: ds_read_b32 ; SI: ds_read_b32 ; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 @@ -219,7 +219,7 @@ ; SI-LABEL: {{^}}read2_ptr_is_subreg_f32: ; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} ; SI: s_endpgm -define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1 @@ -243,7 +243,7 @@ ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4 @@ -261,7 +261,7 @@ ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 
@llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -280,7 +280,7 @@ ; SI-LABEL: @unaligned_read2_f32 ; SI-NOT: ds_read2_b32 ; SI: s_endpgm -define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 1 @@ -296,7 +296,7 @@ ; SI-LABEL: @misaligned_2_simple_read2_f32 ; SI-NOT: ds_read2_b32 ; SI: s_endpgm -define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 2 @@ -315,7 +315,7 @@ ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f64(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -331,7 +331,7 @@ ; SI-LABEL: @simple_read2_f64_max_offset ; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 ; SI: s_endpgm -define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -349,7 +349,7 @@ ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 ; SI: s_endpgm -define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -367,7 +367,7 @@ ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 ; SI: s_endpgm -define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 4 @@ -385,7 +385,7 @@ ; SI-LABEL: @load_constant_adjacent_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -define void 
@load_constant_adjacent_offsets(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 %sum = add i32 %val0, %val1 @@ -396,7 +396,7 @@ ; SI-LABEL: @load_constant_disjoint_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 -define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 %sum = add i32 %val0, %val1 @@ -410,7 +410,7 @@ ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 -define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { +define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 %sum = add i64 %val0, %val1 @@ -426,7 +426,7 @@ ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 ; SI: s_endpgm -define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { +define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 %sum = add i64 %val0, %val1 @@ -437,7 +437,7 @@ @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 -define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { +define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i @@ -481,13 +481,13 @@ ret void } -define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 ret void } -define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { %load = load i64, i64 addrspace(3)* %in, align 4 store i64 %load, 
i64 addrspace(1)* %out, align 8 ret void Index: test/CodeGen/AMDGPU/ds_read2_offset_order.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -10,7 +10,7 @@ ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12 -define void @offset_order(float addrspace(1)* %out) { +define amdgpu_kernel void @offset_order(float addrspace(1)* %out) { entry: %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 %val0 = load float, float addrspace(3)* %ptr0 Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -12,7 +12,7 @@ ; CI: s_waitcnt lgkmcnt(0) ; CI: buffer_store_dwordx2 [[RESULT]] ; CI: s_endpgm -define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4 @@ -26,7 +26,7 @@ ; CI: s_waitcnt lgkmcnt(0) ; CI: buffer_store_dwordx2 [[RESULT]] ; CI: s_endpgm -define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0 @@ -43,7 +43,7 @@ ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD2]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4 @@ -68,7 +68,7 @@ ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD1]] ; CI: s_endpgm -define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4 @@ -88,7 +88,7 @@ ; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} ; CI: buffer_store_dwordx4 [[REG_ZW]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] 
addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8 @@ -101,7 +101,7 @@ ; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} ; CI: buffer_store_dwordx4 [[REG_ZW]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0 @@ -117,7 +117,7 @@ ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}} ; CI: s_endpgm -define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0 @@ -138,7 +138,7 @@ ; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 ; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 ; CI: s_endpgm -define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0 @@ -153,7 +153,7 @@ ; CI-NOT: v_mov ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}} ; CI: s_endpgm -define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 @@ -176,7 +176,7 @@ ; CI-NOT: v_mov ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}} ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 Index: test/CodeGen/AMDGPU/ds_read2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2st64.ll +++ test/CodeGen/AMDGPU/ds_read2st64.ll @@ -10,7 +10,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void 
@simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -29,7 +29,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -49,7 +49,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -69,7 +69,7 @@ ; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}} ; SI: s_endpgm -define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -86,7 +86,7 @@ ; SI-LABEL: @odd_invalid_read2st64_f32_0 ; SI-NOT: ds_read2st64_b32 ; SI: s_endpgm -define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -102,7 +102,7 @@ ; SI-LABEL: @odd_invalid_read2st64_f32_1 ; SI-NOT: ds_read2st64_b32 ; SI: s_endpgm -define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 @@ -122,7 +122,7 @@ ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -141,7 +141,7 @@ ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void 
@simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -161,7 +161,7 @@ ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 ; SI: s_endpgm -define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 4 @@ -181,7 +181,7 @@ ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 256 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -201,7 +201,7 @@ ; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] ; SI: s_endpgm -define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -218,7 +218,7 @@ ; SI-LABEL: @invalid_read2st64_f64_odd_offset ; SI-NOT: ds_read2st64_b64 ; SI: s_endpgm -define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -239,7 +239,7 @@ ; SI-NOT: ds_read2st_b64 ; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 ; SI: s_endpgm -define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -9,7 +9,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 ; SI: s_endpgm -define void @simple_write2_one_val_f32(float addrspace(1)* %C, float 
addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i %val = load float, float addrspace(1)* %in.gep, align 4 @@ -27,7 +27,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -46,7 +46,7 @@ ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -65,7 +65,7 @@ ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -86,7 +86,7 @@ ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 @@ -107,7 +107,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 @@ -126,7 +126,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 @@ -146,7 +146,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; SI: s_endpgm -define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -164,7 +164,7 @@ ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 ; SI: s_endpgm -define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -182,7 +182,7 @@ ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x @@ -212,7 +212,7 @@ ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x @@ -243,7 +243,7 @@ ; SI: ds_write_b32 ; SI: ds_write_b32 ; SI: s_endpgm -define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -270,7 +270,7 @@ ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 ; 
SI: s_endpgm -define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 @@ -288,7 +288,7 @@ ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 ; SI: s_endpgm -define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 @@ -306,7 +306,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} ; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 @@ -325,7 +325,7 @@ ; SI-LABEL: @store_constant_adjacent_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -define void @store_constant_adjacent_offsets() { +define amdgpu_kernel void @store_constant_adjacent_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 ret void @@ -335,7 +335,7 @@ ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 -define void @store_constant_disjoint_offsets() { +define amdgpu_kernel void @store_constant_disjoint_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 ret void @@ -348,7 +348,7 @@ ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; SI: s_endpgm -define void @store_misaligned64_constant_offsets() { +define amdgpu_kernel void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 ret void @@ -362,7 +362,7 @@ ; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI: s_endpgm -define void @store_misaligned64_constant_large_offsets() { +define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { store i64 123, i64 
addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 ret void @@ -371,7 +371,7 @@ @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 -define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 %val = load float, float addrspace(1)* %in @@ -410,7 +410,7 @@ ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}} ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}} ; CI: s_endpgm -define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4 Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -7,7 +7,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 ; SI: s_endpgm -define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i %val = load float, float addrspace(1)* %in.gep, align 4 @@ -25,7 +25,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 ; SI: s_endpgm -define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -46,7 +46,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -66,7 +66,7 @@ ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], ; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 ; 
SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 @@ -85,7 +85,7 @@ ; SI-NOT: ds_write2st64_b64 ; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 ; SI: s_endpgm -define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 Index: test/CodeGen/AMDGPU/dynamic_stackalloc.ll =================================================================== --- test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -4,7 +4,7 @@ ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { +define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { %alloca = alloca i32, i32 %n store volatile i32 0, i32* %alloca ret void Index: test/CodeGen/AMDGPU/early-if-convert-cost.ll =================================================================== --- test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -10,7 +10,7 @@ ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { entry: %v = load double, double addrspace(1)* %in %cc = fcmp oeq double %v, 1.000000e+00 @@ -32,7 +32,7 @@ ; GCN: v_add_f64 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e32 -define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { entry: %v = load double, double addrspace(2)* %in %cc = fcmp oeq double %v, 1.000000e+00 @@ -62,7 +62,7 @@ ; GCN-DAG: buffer_store_dword v ; GCN-DAG: buffer_store_dwordx2 -define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(1)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 @@ -93,7 +93,7 @@ ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: buffer_store_dwordx4 -define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> 
addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <4 x i32>, <4 x i32> addrspace(1)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 Index: test/CodeGen/AMDGPU/early-if-convert.ll =================================================================== --- test/CodeGen/AMDGPU/early-if-convert.ll +++ test/CodeGen/AMDGPU/early-if-convert.ll @@ -9,7 +9,7 @@ ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc ; GCN: buffer_store_dword [[RESULT]] -define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -32,7 +32,7 @@ ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc ; GCN: buffer_store_dword [[RESULT]] -define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -58,7 +58,7 @@ ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc ; GCN: s_mov_b64 vcc, [[CMP]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc -define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 { entry: %v = load i32, i32 addrspace(1)* %in %cc = fcmp oeq float %k, 1.000000e+00 @@ -87,7 +87,7 @@ ; GCN: v_mul_f32 ; GCN: v_mul_f32 ; GCN: v_cndmask_b32_e32 -define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -128,7 +128,7 @@ ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -162,7 +162,7 @@ ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -187,7 +187,7 @@ ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { entry: %v = load i32, i32 addrspace(2)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 @@ -206,7 +206,7 @@ ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load: ; GCN: v_cndmask_b32 -define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { +define amdgpu_kernel void 
@test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { entry: %v = load float, float addrspace(2)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -227,7 +227,7 @@ ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload: ; GCN: v_cndmask_b32 -define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 { entry: %cc = fcmp oeq float %v, 1.000000e+00 br i1 %cc, label %if, label %endif @@ -248,7 +248,7 @@ ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]] -define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { entry: %v = load i32, i32 addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -274,7 +274,7 @@ ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 { entry: %v = load float, float addrspace(1)* %in %cc = icmp eq i32 %cond, 1 @@ -295,7 +295,7 @@ ; GCN: s_addc_u32 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { entry: %v = load i64, i64 addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -320,7 +320,7 @@ ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -345,7 +345,7 @@ ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { entry: %v = load <4 x i32>, <4 x i32> addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -364,7 +364,7 @@ ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}} -define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %else, label %if @@ -385,7 +385,7 @@ ; GCN: {{^}}; BB#0: ; GCN-NEXT: s_load_dwordx2 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0 -define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) { entry: br i1 undef, 
label %else, label %if @@ -410,7 +410,7 @@ ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <8 x i32>, <8 x i32> addrspace(1)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 @@ -435,7 +435,7 @@ ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <16 x i32>, <16 x i32> addrspace(1)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 Index: test/CodeGen/AMDGPU/elf.r600.ll =================================================================== --- test/CodeGen/AMDGPU/elf.r600.ll +++ test/CodeGen/AMDGPU/elf.r600.ll @@ -9,7 +9,7 @@ ; CONFIG-NEXT: .long 2 ; CONFIG-NEXT: .long 165900 ; CONFIG-NEXT: .long 0 -define void @test(float addrspace(1)* %out, i32 %p) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i32 %p) { %i = add i32 %p, 2 %r = bitcast i32 %i to float store float %r, float addrspace(1)* %out Index: test/CodeGen/AMDGPU/empty-function.ll =================================================================== --- test/CodeGen/AMDGPU/empty-function.ll +++ test/CodeGen/AMDGPU/empty-function.ll @@ -7,14 +7,14 @@ ; SI-LABEL: {{^}}empty_function_ret: ; SI: s_endpgm ; SI: codeLenInByte = 4 -define void @empty_function_ret() #0 { +define amdgpu_kernel void @empty_function_ret() #0 { ret void } ; SI: .text ; SI-LABEL: {{^}}empty_function_unreachable: ; SI: codeLenInByte = 0 -define void @empty_function_unreachable() #0 { +define amdgpu_kernel void @empty_function_unreachable() #0 { unreachable } Index: test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll =================================================================== --- test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll +++ test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll @@ -9,7 +9,7 @@ ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] ; GCN-UNSAFE-NOT: xor -define void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 Index: test/CodeGen/AMDGPU/endcf-loop-header.ll =================================================================== --- test/CodeGen/AMDGPU/endcf-loop-header.ll +++ test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -12,7 +12,7 @@ ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} ; CHECK-NOT: s_or_b64 exec, exec ; CHECK: s_cbranch_execnz [[LOOP_LABEL]] -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: %cond = call i32 @llvm.r600.read.tidig.x() #0 %tmp0 = icmp eq i32 %cond, 0 Index: test/CodeGen/AMDGPU/exceed-max-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: error: scalar registers limit of 104 exceeded (106) in 
use_too_many_sgprs_tahiti -define void @use_too_many_sgprs_tahiti() #0 { +define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -20,7 +20,7 @@ } ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire -define void @use_too_many_sgprs_bonaire() #1 { +define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -39,7 +39,7 @@ } ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr -define void @use_too_many_sgprs_bonaire_flat_scr() #1 { +define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -59,7 +59,7 @@ } ; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland -define void @use_too_many_sgprs_iceland() #2 { +define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 { call void asm sideeffect "", "~{VCC}" () call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () @@ -77,7 +77,7 @@ } ; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji -define void @use_too_many_sgprs_fiji() #3 { +define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () Index: test/CodeGen/AMDGPU/extend-bit-ops-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extend-bit-ops-i16.ll +++ test/CodeGen/AMDGPU/extend-bit-ops-i16.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: and_zext: ; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in @@ -18,7 +18,7 @@ ; GCN-LABEL: or_zext: ; GCN: v_or_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in @@ -33,7 +33,7 @@ ; GCN-LABEL: xor_zext: ; GCN: v_xor_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, 
v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in Index: test/CodeGen/AMDGPU/extload-align.ll =================================================================== --- test/CodeGen/AMDGPU/extload-align.ll +++ test/CodeGen/AMDGPU/extload-align.ll @@ -9,7 +9,7 @@ ; DEBUG: mem:LD2[]{{[^(]}} ; DEBUG: {{^}}# End machine code for function extload_align. -define void @extload_align(i32* %out, i32 %index) #0 { +define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 { %v0 = alloca [4 x i16] %a1 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 0 %a2 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 1 Index: test/CodeGen/AMDGPU/extload-private.ll =================================================================== --- test/CodeGen/AMDGPU/extload-private.ll +++ test/CodeGen/AMDGPU/extload-private.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}load_i8_sext_private: ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i8_sext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 %tmp1 = load i8, i8* %tmp0 @@ -14,7 +14,7 @@ ; FUNC-LABEL: {{^}}load_i8_zext_private: ; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i8_zext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 %tmp1 = load i8, i8* %tmp0 @@ -25,7 +25,7 @@ ; FUNC-LABEL: {{^}}load_i16_sext_private: ; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i16_sext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 %tmp1 = load i16, i16* %tmp0 @@ -36,7 +36,7 @@ ; FUNC-LABEL: {{^}}load_i16_zext_private: ; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i16_zext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 %tmp1 = load volatile i16, i16* %tmp0 Index: test/CodeGen/AMDGPU/extload.ll =================================================================== --- test/CodeGen/AMDGPU/extload.ll +++ test/CodeGen/AMDGPU/extload.ll @@ -10,7 +10,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], ; EG: VTX_READ_32 [[VAL]] -define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* %load = load i32, i32 addrspace(1)* %cast %x = bitcast i32 %load to <4 x i8> @@ -25,7 +25,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], ; EG: VTX_READ_32 [[VAL]] -define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* 
nocapture noalias %src) nounwind { %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* %load = load i32, i32 addrspace(1)* %cast %x = bitcast i32 %load to <2 x i16> @@ -40,7 +40,7 @@ ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] ; EG: LDS_WRITE * [[VAL]] -define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* %load = load i32, i32 addrspace(3)* %cast %x = bitcast i32 %load to <4 x i8> @@ -55,7 +55,7 @@ ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] ; EG: LDS_WRITE * [[VAL]] -define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* %load = load i32, i32 addrspace(3)* %cast %x = bitcast i32 %load to <2 x i16> Index: test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll =================================================================== --- test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll +++ test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll @@ -13,7 +13,7 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i32 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { @@ -55,7 +55,7 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i32 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { @@ -99,7 +99,7 @@ ; GCN: buffer_store_dwordx2 ; GCN: buffer_store_dwordx2 -define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i64 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { Index: test/CodeGen/AMDGPU/extract_vector_elt-f16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -8,7 +8,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %p0 = extractelement <2 x half> %vec, i32 0 %p1 = extractelement <2 x half> %vec, i32 1 @@ -26,7 +26,7 @@ ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define void 
@extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, half addrspace(1)* %out, align 2 @@ -45,7 +45,7 @@ ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext @@ -61,7 +61,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_store_short ; GCN: buffer_store_short -define void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 %out1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -75,7 +75,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_store_short ; GCN: buffer_store_short -define void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 { %p0 = extractelement <4 x half> %foo, i32 0 %p1 = extractelement <4 x half> %foo, i32 2 %out1 = getelementptr half, half addrspace(1)* %out, i32 10 @@ -95,7 +95,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_store_short -define void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, half addrspace(1)* %out, i32 1 store half %p0, half addrspace(1)* %out @@ -115,7 +115,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_store_short -define void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 { %p0 = extractelement <4 x half> %foo, i32 %idx %out1 = getelementptr half, half addrspace(1)* %out, i32 1 store half %p0, half addrspace(1)* %out Index: test/CodeGen/AMDGPU/extract_vector_elt-f64.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-f64.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-f64.ll @@ -5,7 +5,7 @@ ; GCN: buffer_load_dwordx4 ; GCN: buffer_load_dwordx2 ; GCN: buffer_store_dwordx2 -define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in %elt = extractelement <3 x double> %ld, i32 2 store volatile double %elt, double addrspace(1)* %out @@ -13,14 +13,14 @@ } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64: -define void 
@dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64: -define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -9,7 +9,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %p0 = extractelement <2 x i16> %vec, i32 0 %p1 = extractelement <2 x i16> %vec, i32 1 @@ -27,7 +27,7 @@ ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 addrspace(1)* %out, align 2 @@ -45,7 +45,7 @@ ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext @@ -61,7 +61,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_store_short ; GCN: buffer_store_short -define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { %p0 = extractelement <3 x i16> %foo, i32 0 %p1 = extractelement <3 x i16> %foo, i32 2 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 @@ -82,7 +82,7 @@ ; GFX9-DAG: buffer_store_short [[VLOAD0]], off ; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]] ; GFX9-DAG: buffer_store_short [[VLOAD1]], off -define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10 @@ -105,7 +105,7 @@ ; GCN: buffer_load_ushort ; GCN: 
buffer_store_short -define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out @@ -131,7 +131,7 @@ ; GFX9: buffer_store_dword ; GFX9: buffer_load_ushort ; GFX9: buffer_store_short -define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <4 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out Index: test/CodeGen/AMDGPU/extract_vector_elt-i64.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -8,7 +8,7 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dwordx2 -define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 { +define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 { %vec = bitcast i64 %val to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 0 %elt1 = extractelement <2 x i32> %vec, i32 1 @@ -20,7 +20,7 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v2i64: -define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 { %p0 = extractelement <2 x i64> %foo, i32 0 %p1 = extractelement <2 x i64> %foo, i32 1 %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1 @@ -30,14 +30,14 @@ } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64: -define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <2 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2: -define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 { %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo %or = or <2 x i64> %load, %arst %dynelt = extractelement <2 x i64> %or, i32 %elt @@ -46,14 +46,14 @@ } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64: -define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64: -define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void Index: 
test/CodeGen/AMDGPU/extract_vector_elt-i8.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}extract_vector_elt_v1i8: ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { %p0 = extractelement <1 x i8> %foo, i32 0 store i8 %p0, i8 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { %p0 = extractelement <2 x i8> %foo, i32 0 %p1 = extractelement <2 x i8> %foo, i32 1 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -29,7 +29,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { %p0 = extractelement <3 x i8> %foo, i32 0 %p1 = extractelement <3 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -43,7 +43,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { %p0 = extractelement <4 x i8> %foo, i32 0 %p1 = extractelement <4 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -57,7 +57,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { %p0 = extractelement <8 x i8> %foo, i32 0 %p1 = extractelement <8 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -71,7 +71,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { %p0 = extractelement <16 x i8> %foo, i32 0 %p1 = extractelement <16 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -85,7 +85,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { %p0 = extractelement <32 x i8> %foo, i32 0 %p1 = extractelement <32 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -99,7 +99,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 { %p0 = extractelement <64 x i8> %foo, i32 0 %p1 = extractelement <64 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -120,7 +120,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte 
-define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i8> %foo, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out @@ -141,7 +141,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 { %p0 = extractelement <4 x i8> %foo, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out Index: test/CodeGen/AMDGPU/extractelt-to-trunc.ll =================================================================== --- test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -7,7 +7,7 @@ ; GCN-DAG: buffer_load_dword [[A:v[0-9]+]] ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]] ; GCN: buffer_store_dword [[ADD]] -define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { +define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { %a = load i64, i64 addrspace(1)* %in %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x i32> @@ -20,7 +20,7 @@ ; GCN: buffer_load_dwordx2 ; GCN: v_add_f64 ; GCN: buffer_store_dword v -define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { +define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { %a = load double, double addrspace(1)* %in %add = fadd double %a, %b %val.bc = bitcast double %add to <2 x i32> @@ -33,7 +33,7 @@ ; GCN: buffer_load_dwordx2 ; GCN: v_add_i32 ; GCN: buffer_store_dword -define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { +define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { %a = load i64, i64 addrspace(1)* %in %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x float> @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}no_extract_volatile_load_extract0: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %elt0 = extractelement <4 x i32> %vec, i32 0 @@ -57,7 +57,7 @@ ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %elt2 = extractelement <4 x i32> %vec, i32 2 @@ -68,7 +68,7 @@ ; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { 
entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %eltN = extractelement <4 x i32> %vec, i32 %idx Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,7 +11,7 @@ ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) store half %fabs, half addrspace(1)* %out @@ -22,7 +22,7 @@ ; CI: flat_load_ushort [[VAL:v[0-9]+]], ; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @s_fabs_f16(half addrspace(1)* %out, half %in) { +define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, half addrspace(1)* %out ret void @@ -48,7 +48,7 @@ ; GFX9: s_load_dword [[VAL:s[0-9]+]] ; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff -define void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { +define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, <2 x half> addrspace(1)* %out ret void @@ -68,7 +68,7 @@ ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; GCN: flat_store_dwordx2 -define void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { +define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) store <4 x half> %fabs, <4 x half> addrspace(1)* %out ret void @@ -87,7 +87,7 @@ ; VI-NOT: and ; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { +define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 store half %fmul, half addrspace(1)* %out @@ -97,7 +97,7 @@ ; GCN-LABEL: {{^}}v_fabs_v2f16: ; GCN: flat_load_dword [[VAL:v[0-9]+]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]] -define void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid @@ -110,7 +110,7 @@ ; GCN-LABEL: {{^}}fabs_free_v2f16: ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff -define void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { %bc = bitcast i32 %in to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc) store <2 x half> %fabs, <2 x half> addrspace(1)* %out @@ -133,7 +133,7 @@ ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]] ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}} -define void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define 
amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %fmul = fmul <2 x half> %fabs, %val Index: test/CodeGen/AMDGPU/fabs.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f64.ll +++ test/CodeGen/AMDGPU/fabs.f64.ll @@ -10,7 +10,7 @@ ; FUNC-LABEL: {{^}}v_fabs_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tidext = sext i32 %tid to i64 %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext @@ -24,7 +24,7 @@ ; SI: v_and_b32 ; SI-NOT: v_and_b32 ; SI: s_endpgm -define void @fabs_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) { %fabs = call double @llvm.fabs.f64(double %in) store double %fabs, double addrspace(1)* %out ret void @@ -34,7 +34,7 @@ ; SI: v_and_b32 ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) store <2 x double> %fabs, <2 x double> addrspace(1)* %out ret void @@ -46,7 +46,7 @@ ; SI: v_and_b32 ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) store <4 x double> %fabs, <4 x double> addrspace(1)* %out ret void @@ -57,7 +57,7 @@ ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| ; SI: s_endpgm -define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out @@ -69,7 +69,7 @@ ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| ; SI: s_endpgm -define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out @@ -79,7 +79,7 @@ ; FUNC-LABEL: {{^}}fabs_free_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) store double %fabs, double addrspace(1)* %out @@ -89,7 +89,7 @@ ; FUNC-LABEL: {{^}}fabs_fn_free_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double %fabs = call double @fabs(double %bc) store double %fabs, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/fabs.ll 
=================================================================== --- test/CodeGen/AMDGPU/fabs.ll +++ test/CodeGen/AMDGPU/fabs.ll @@ -13,7 +13,7 @@ ; GCN: v_and_b32 -define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @fabs(float %bc) store float %fabs, float addrspace(1)* %out @@ -26,7 +26,7 @@ ; GCN: v_and_b32 -define void @fabs_free(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) store float %fabs, float addrspace(1)* %out @@ -37,7 +37,7 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; GCN: v_and_b32 -define void @fabs_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) store float %fabs, float addrspace(1)* %out ret void @@ -49,7 +49,7 @@ ; GCN: v_and_b32 ; GCN: v_and_b32 -define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out ret void @@ -65,7 +65,7 @@ ; GCN: v_and_b32 ; GCN: v_and_b32 ; GCN: v_and_b32 -define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out ret void @@ -76,7 +76,7 @@ ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| -define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { +define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @fabs(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, float addrspace(1)* %out @@ -88,7 +88,7 @@ ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| -define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { +define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, float addrspace(1)* %out Index: test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll =================================================================== --- test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -28,7 +28,7 @@ ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -55,7 +55,7 @@ ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]] ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]] ; GCN-FASTFMA: buffer_store_dword [[FMA1]] -define void @fast_sub_fmuladd_fmul() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load 
volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -87,7 +87,7 @@ ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_mul() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -120,7 +120,7 @@ ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -145,7 +145,7 @@ ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -170,7 +170,7 @@ ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -205,7 +205,7 @@ ; GCN: buffer_store_dword [[MUL]] ; GCN: buffer_store_dword [[MAD]] -define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -241,7 +241,7 @@ ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_subrev_f32_e32 -define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef Index: test/CodeGen/AMDGPU/fadd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fadd.f16.ll +++ test/CodeGen/AMDGPU/fadd.f16.ll @@ -11,7 +11,7 @@ ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16( +define amdgpu_kernel void @fadd_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -31,7 +31,7 @@ ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16_imm_a( +define amdgpu_kernel void @fadd_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -49,7 +49,7 @@ ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16_imm_b( +define amdgpu_kernel void @fadd_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -83,7 +83,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; 
GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16( +define amdgpu_kernel void @fadd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -111,7 +111,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16_imm_a( +define amdgpu_kernel void @fadd_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -136,7 +136,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16_imm_b( +define amdgpu_kernel void @fadd_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/fadd.ll =================================================================== --- test/CodeGen/AMDGPU/fadd.ll +++ test/CodeGen/AMDGPU/fadd.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}fadd_f32: ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W ; SI: v_add_f32 -define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 { %add = fadd float %a, %b store float %add, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ ; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { %add = fadd <2 x float> %a, %b store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 ret void @@ -31,7 +31,7 @@ ; SI: v_add_f32 ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 @@ -57,7 +57,7 @@ ; SI: v_add_f32 ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { +define amdgpu_kernel void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { %add = fadd <8 x float> %a, %b store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32 ret void @@ -65,7 +65,7 @@ ; FUNC-LABEL: {{^}}fadd_0_nsz_attr_f32: ; SI-NOT: v_add_f32 -define void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 { +define amdgpu_kernel void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 { %add = fadd float %a, 0.0 store float %add, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/fadd64.ll =================================================================== --- test/CodeGen/AMDGPU/fadd64.ll +++ test/CodeGen/AMDGPU/fadd64.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}v_fadd_f64: ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} -define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -14,7 +14,7 @@ ; CHECK-LABEL: 
{{^}}s_fadd_f64: ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { +define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { %r2 = fadd double %r0, %r1 store double %r2, double addrspace(1)* %out ret void @@ -24,7 +24,7 @@ ; CHECK: v_add_f64 ; CHECK: v_add_f64 ; CHECK: _store_dwordx4 -define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -37,7 +37,7 @@ ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} ; CHECK: _store_dwordx4 -define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { +define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { %r2 = fadd <2 x double> %r0, %r1 store <2 x double> %r2, <2 x double> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, half addrspace(1)* %out @@ -19,7 +19,7 @@ ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, half addrspace(1)* %out @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) @@ -40,7 +40,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fsub half -0.0, %val.fabs @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16: ; GCN: v_mul_f16_e64 
[[REG:v[0-9]+]], 1.0, -{{v[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fneg = fsub half -0.0, %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) @@ -63,7 +63,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -72,7 +72,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -81,7 +81,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -90,7 +90,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -99,7 +99,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -108,7 +108,7 @@ ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -117,7 +117,7 @@ ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* 
%out ret void @@ -126,7 +126,7 @@ ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -135,7 +135,7 @@ ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -144,7 +144,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, half addrspace(1)* %out ret void @@ -153,7 +153,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, half addrspace(1)* %out ret void @@ -162,7 +162,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, half addrspace(1)* %out ret void @@ -171,7 +171,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, half addrspace(1)* %out ret void @@ -180,7 +180,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, half addrspace(1)* %out ret void @@ -189,7 +189,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] 
-define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, half addrspace(1)* %out ret void @@ -198,7 +198,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, half addrspace(1)* %out ret void @@ -211,7 +211,7 @@ ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] -define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -229,7 +229,7 @@ ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}} ; GCN: buffer_store_dword -define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs) @@ -246,7 +246,7 @@ ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GCN: buffer_store_dword -define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs @@ -265,7 +265,7 @@ ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GFX9: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) @@ -280,7 +280,7 @@ ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] -define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { %val = bitcast i32 %val.arg to <2 x half> %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -290,7 +290,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p0_v2f16(<2 x half>
addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -299,7 +299,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -308,7 +308,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -317,7 +317,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -326,7 +326,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -335,7 +335,7 @@ ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -344,7 +344,7 @@ ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -353,7 +353,7 @@ ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)*
%out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -362,7 +362,7 @@ ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -371,7 +371,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -380,7 +380,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -389,7 +389,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -398,7 +398,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -407,7 +407,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -416,7 +416,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void
@test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -425,7 +425,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fcanonicalize.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.ll +++ test/CodeGen/AMDGPU/fcanonicalize.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32: ; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out @@ -18,7 +18,7 @@ ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out ret void @@ -27,7 +27,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs) @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) %val.fabs.fneg = fsub float -0.0, %val.fabs @@ -50,7 +50,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fneg = fsub float -0.0, %val %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg) @@ -61,7 +61,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword
[[REG]] -define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -0.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 1.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -1.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -97,7 +97,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 16.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -106,7 +106,7 @@ ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -115,7 +115,7 @@ ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -124,7 +124,7 @@ ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 
2155872255 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -133,7 +133,7 @@ ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -142,7 +142,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) store float %canonicalized, float addrspace(1)* %out ret void @@ -151,7 +151,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -160,7 +160,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -169,7 +169,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -178,7 +178,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -187,7 +187,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) store float 
%canonicalized, float addrspace(1)* %out ret void @@ -196,7 +196,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -205,7 +205,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out @@ -215,7 +215,7 @@ ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out ret void @@ -224,7 +224,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}| ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs) @@ -235,7 +235,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}| ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) %val.fabs.fneg = fsub double -0.0, %val.fabs @@ -247,7 +247,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fneg = fsub double -0.0, %val %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg) @@ -259,7 +259,7 @@ ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, double addrspace(1)* %out ret void 
@@ -269,7 +269,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -279,7 +279,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -289,7 +289,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -299,7 +299,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 16.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -309,7 +309,7 @@ ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -319,7 +319,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -329,7 +329,7 @@ ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -339,7 +339,7 @@ ; 
GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -349,7 +349,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double %canonicalized, double addrspace(1)* %out ret void @@ -359,7 +359,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -369,7 +369,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -379,7 +379,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -389,7 +389,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -399,7 +399,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 
bitcast (i64 18442240474082181121 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -409,7 +409,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, double addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fceil.ll =================================================================== --- test/CodeGen/AMDGPU/fceil.ll +++ test/CodeGen/AMDGPU/fceil.ll @@ -13,7 +13,7 @@ ; SI: v_ceil_f32_e32 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) { %y = call float @llvm.ceil.f32(float %x) nounwind readnone store float %y, float addrspace(1)* %out ret void @@ -25,7 +25,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone store <2 x float> %y, <2 x float> addrspace(1)* %out ret void @@ -41,7 +41,7 @@ ; EG-DAG: CEIL {{\*? *}}[[RESULT1]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone store <3 x float> %y, <3 x float> addrspace(1)* %out ret void @@ -57,7 +57,7 @@ ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone store <4 x float> %y, <4 x float> addrspace(1)* %out ret void @@ -82,7 +82,7 @@ ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { +define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone store <8 x float> %y, <8 x float> addrspace(1)* %out ret void @@ -125,7 +125,7 @@ ; EG-DAG: CEIL {{\*? *}}[[RESULT4]] ; EG-DAG: CEIL {{\*? *}}[[RESULT4]] ; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] -define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { +define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone store <16 x float> %y, <16 x float> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fceil64.ll =================================================================== --- test/CodeGen/AMDGPU/fceil64.ll +++ test/CodeGen/AMDGPU/fceil64.ll @@ -31,7 +31,7 @@ ; SI: v_cndmask_b32 ; SI: v_add_f64 ; SI: s_endpgm -define void @fceil_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.ceil.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -40,7 +40,7 @@ ; FUNC-LABEL: {{^}}fceil_v2f64: ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ ; FIXME-CI: v_ceil_f64_e32 ; FIXME-CI: v_ceil_f64_e32 ; FIXME-CI: v_ceil_f64_e32 -; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone ; store <3 x double> %y, <3 x double> addrspace(1)* %out ; ret void @@ -61,7 +61,7 @@ ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, <4 x double> addrspace(1)* %out ret void @@ -76,7 +76,7 @@ ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -99,7 +99,7 @@ ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fcmp-cnd.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp-cnd.ll +++ test/CodeGen/AMDGPU/fcmp-cnd.ll @@ -4,7 +4,7 @@ ;registers and literal.x depending on what the optimizer does. 
;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %cmp = fcmp oeq float %0, 0.000000e+00 Index: test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll +++ test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll @@ -6,7 +6,7 @@ ; CHECK: SET{{[A-Z]+}}_DX10 -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %cmp = fcmp oeq float %0, 0.000000e+00 Index: test/CodeGen/AMDGPU/fcmp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp.f16.ll +++ test/CodeGen/AMDGPU/fcmp.f16.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lt( +define amdgpu_kernel void @fcmp_f16_lt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -37,7 +37,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lt_abs( +define amdgpu_kernel void @fcmp_f16_lt_abs( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -62,7 +62,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_eq( +define amdgpu_kernel void @fcmp_f16_eq( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -85,7 +85,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_le( +define amdgpu_kernel void @fcmp_f16_le( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -108,7 +108,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_gt( +define amdgpu_kernel void @fcmp_f16_gt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -131,7 +131,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lg( +define amdgpu_kernel void @fcmp_f16_lg( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -154,7 +154,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_ge( +define amdgpu_kernel void @fcmp_f16_ge( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -177,7 +177,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_o( +define amdgpu_kernel void @fcmp_f16_o( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -200,7 +200,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_u( +define amdgpu_kernel void @fcmp_f16_u( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -223,7 +223,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nge( +define amdgpu_kernel void @fcmp_f16_nge( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -246,7 +246,7 @@ ; GCN: v_cndmask_b32_e64 
v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nlg( +define amdgpu_kernel void @fcmp_f16_nlg( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -269,7 +269,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_ngt( +define amdgpu_kernel void @fcmp_f16_ngt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -292,7 +292,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nle( +define amdgpu_kernel void @fcmp_f16_nle( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -315,7 +315,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_neq( +define amdgpu_kernel void @fcmp_f16_neq( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -338,7 +338,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nlt( +define amdgpu_kernel void @fcmp_f16_nlt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -368,7 +368,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_lt( +define amdgpu_kernel void @fcmp_v2f16_lt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -398,7 +398,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_eq( +define amdgpu_kernel void @fcmp_v2f16_eq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -428,7 +428,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_le( +define amdgpu_kernel void @fcmp_v2f16_le( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -458,7 +458,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_gt( +define amdgpu_kernel void @fcmp_v2f16_gt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -488,7 +488,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_lg( +define amdgpu_kernel void @fcmp_v2f16_lg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -518,7 +518,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_ge( +define amdgpu_kernel void @fcmp_v2f16_ge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -548,7 +548,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_o( +define amdgpu_kernel void @fcmp_v2f16_o( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -578,7 +578,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_u( 
+define amdgpu_kernel void @fcmp_v2f16_u( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -608,7 +608,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nge( +define amdgpu_kernel void @fcmp_v2f16_nge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -638,7 +638,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nlg( +define amdgpu_kernel void @fcmp_v2f16_nlg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -668,7 +668,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_ngt( +define amdgpu_kernel void @fcmp_v2f16_ngt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -698,7 +698,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nle( +define amdgpu_kernel void @fcmp_v2f16_nle( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -728,7 +728,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_neq( +define amdgpu_kernel void @fcmp_v2f16_neq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -758,7 +758,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nlt( +define amdgpu_kernel void @fcmp_v2f16_nlt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { Index: test/CodeGen/AMDGPU/fcmp.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp.ll +++ test/CodeGen/AMDGPU/fcmp.ll @@ -3,7 +3,7 @@ ; CHECK: {{^}}fcmp_sext: ; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 @@ -22,7 +22,7 @@ ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} ; CHECK-NEXT: {{[0-9]+\(5.0}} -define void @fcmp_br(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 br i1 %0, label %IF, label %ENDIF Index: test/CodeGen/AMDGPU/fcmp64.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp64.ll +++ test/CodeGen/AMDGPU/fcmp64.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}flt_f64: ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -15,7 +15,7 @@ ; CHECK-LABEL: {{^}}fle_f64: ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void 
@fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -27,7 +27,7 @@ ; CHECK-LABEL: {{^}}fgt_f64: ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -39,7 +39,7 @@ ; CHECK-LABEL: {{^}}fge_f64: ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -51,7 +51,7 @@ ; CHECK-LABEL: {{^}}fne_f64: ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -63,7 +63,7 @@ ; CHECK-LABEL: {{^}}feq_f64: ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 Index: test/CodeGen/AMDGPU/fconst64.ll =================================================================== --- test/CodeGen/AMDGPU/fconst64.ll +++ test/CodeGen/AMDGPU/fconst64.ll @@ -5,7 +5,7 @@ ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 -define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %r1 = load double, double addrspace(1)* %in %r2 = fadd double %r1, 5.000000e+00 store double %r2, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/fcopysign.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f16.ll +++ test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -23,7 +23,7 @@ ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_f16( +define amdgpu_kernel void @test_copysign_f16( half addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -43,7 +43,7 @@ ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]] ; GCN: buffer_store_dword v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f32_mag_f16_sign_f32( +define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32( float addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { @@ -65,7 +65,7 @@ ; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}} ; GCN: s_endpgm -define void @test_copysign_out_f64_mag_f16_sign_f64( +define amdgpu_kernel 
void @test_copysign_out_f64_mag_f16_sign_f64( double addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { @@ -88,7 +88,7 @@ ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] ; GCN: buffer_store_dword v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f32_mag_f32_sign_f16( +define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16( float addrspace(1)* %arg_out, float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -111,7 +111,7 @@ ; VI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]] ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}} ; GCN: s_endpgm -define void @test_copysign_out_f64_mag_f64_sign_f16( +define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16( double addrspace(1)* %arg_out, double addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -136,7 +136,7 @@ ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f16_sign_f32( +define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32( half addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { @@ -161,7 +161,7 @@ ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f16_sign_f64( +define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64( half addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { @@ -188,7 +188,7 @@ ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f32_sign_f16( +define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16( half addrspace(1)* %arg_out, float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -204,7 +204,7 @@ ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16: ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f64_sign_f16( +define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16( half addrspace(1)* %arg_out, double addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -221,7 +221,7 @@ ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void @test_copysign_v2f16( +define amdgpu_kernel void @test_copysign_v2f16( <2 x half> addrspace(1)* %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { @@ -236,7 +236,7 @@ ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void @test_copysign_v3f16( +define amdgpu_kernel void @test_copysign_v3f16( <3 x half> addrspace(1)* %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { @@ -252,7 +252,7 @@ ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void @test_copysign_v4f16( +define amdgpu_kernel void @test_copysign_v4f16( <4 x half> addrspace(1)* %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { Index: test/CodeGen/AMDGPU/fcopysign.f32.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f32.ll +++ test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -20,7 +20,7 @@ ; GCN: s_endpgm ; EG: BFI_INT -define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { %result = call float @llvm.copysign.f32(float %mag, float %sign) store float %result, float addrspace(1)* %out, align 4 ret void @@ 
-31,7 +31,7 @@ ; EG: BFI_INT ; EG: BFI_INT -define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 ret void @@ -44,7 +44,7 @@ ; EG: BFI_INT ; EG: BFI_INT ; EG: BFI_INT -define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 ret void Index: test/CodeGen/AMDGPU/fcopysign.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f64.ll +++ test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -17,7 +17,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} ; GCN: s_endpgm -define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, double addrspace(1)* %out, align 8 ret void @@ -32,7 +32,7 @@ ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} -define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { %c = fpext float %sign to double %result = call double @llvm.copysign.f64(double %mag, double %c) store double %result, double addrspace(1)* %out, align 8 @@ -41,7 +41,7 @@ ; FUNC-LABEL: {{^}}test_copysign_v2f64: ; GCN: s_endpgm -define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 ret void @@ -49,7 +49,7 @@ ; FUNC-LABEL: {{^}}test_copysign_v4f64: ; GCN: s_endpgm -define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 ret void Index: test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fdiv.f16.ll +++ test/CodeGen/AMDGPU/fdiv.f16.ll @@ -31,7 +31,7 @@ ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, 
[[RESULT]] -define void @v_fdiv_f16( +define amdgpu_kernel void @v_fdiv_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -54,7 +54,7 @@ ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [[RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -72,7 +72,7 @@ ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -91,7 +91,7 @@ ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [[RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -109,7 +109,7 @@ ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -127,7 +127,7 @@ ; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -147,7 +147,7 @@ ; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -168,7 +168,7 @@ ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -190,7 +190,7 @@ ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { +define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -209,7 +209,7 @@ ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} ; VI: buffer_store_short [[MUL]] 
-define void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, 2.0 store half %rcp, half addrspace(1)* %out, align 4 @@ -221,7 +221,7 @@ ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} ; VI: buffer_store_short [[MUL]] -define void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, 10.0 store half %rcp, half addrspace(1)* %out, align 4 @@ -233,7 +233,7 @@ ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} ; VI: buffer_store_short [[MUL]] -define void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, -10.0 store half %rcp, half addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/fdiv.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fdiv.f64.ll +++ test/CodeGen/AMDGPU/fdiv.f64.ll @@ -29,7 +29,7 @@ ; GCN: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] ; GCN: buffer_store_dwordx2 [[RESULT]] ; GCN: s_endpgm -define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 %num = load volatile double, double addrspace(1)* %in %den = load volatile double, double addrspace(1)* %gep.1 @@ -39,7 +39,7 @@ } ; GCN-LABEL: {{^}}fdiv_f64_s_v: -define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 { +define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 { %den = load double, double addrspace(1)* %in %result = fdiv double %num, %den store double %result, double addrspace(1)* %out @@ -47,7 +47,7 @@ } ; GCN-LABEL: {{^}}fdiv_f64_v_s: -define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 { +define amdgpu_kernel void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 { %num = load double, double addrspace(1)* %in %result = fdiv double %num, %den store double %result, double addrspace(1)* %out @@ -55,14 +55,14 @@ } ; GCN-LABEL: {{^}}fdiv_f64_s_s: -define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 { +define amdgpu_kernel void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 { %result = fdiv double %num, %den store double %result, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_fdiv_v2f64: -define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 %num = load <2 x double>, <2 x double> addrspace(1)* %in %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 @@ -72,14 +72,14 @@ } ; GCN-LABEL: {{^}}s_fdiv_v2f64: -define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { +define amdgpu_kernel void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { %result = fdiv <2 x double> %num, %den 
store <2 x double> %result, <2 x double> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_fdiv_v4f64: -define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 %num = load <4 x double>, <4 x double> addrspace(1)* %in %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 @@ -89,7 +89,7 @@ } ; GCN-LABEL: {{^}}s_fdiv_v4f64: -define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 { +define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 { %result = fdiv <4 x double> %num, %den store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -98,7 +98,7 @@ ; GCN-LABEL: {{^}}div_fast_2_x_pat_f64: ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5 ; GCN: buffer_store_dwordx2 [[MUL]] -define void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef %rcp = fdiv fast double %x, 2.0 store double %rcp, double addrspace(1)* %out, align 4 @@ -110,7 +110,7 @@ ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] -define void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef %rcp = fdiv fast double %x, 10.0 store double %rcp, double addrspace(1)* %out, align 4 @@ -122,7 +122,7 @@ ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] -define void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef %rcp = fdiv fast double %x, -10.0 store double %rcp, double addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- test/CodeGen/AMDGPU/fdiv.ll +++ test/CodeGen/AMDGPU/fdiv.ll @@ -27,7 +27,7 @@ ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv float %a, %b store float %fdiv, float addrspace(1)* %out @@ -52,7 +52,7 @@ ; GCN-NOT: s_setreg ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], -define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv float %a, %b store float %fdiv, float addrspace(1)* %out @@ -65,7 +65,7 @@ ; GCN: v_rcp_f32 ; GCN: v_mul_f32 ; GCN: v_mul_f32 -define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = 
fdiv float %a, %b, !fpmath !0 store float %fdiv, float addrspace(1)* %out @@ -77,7 +77,7 @@ ; GCN: v_fma_f32 ; GCN: v_div_fmas_f32 ; GCN: v_div_fixup_f32 -define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv float %a, %b, !fpmath !0 store float %fdiv, float addrspace(1)* %out @@ -89,7 +89,7 @@ ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv fast float %a, %b store float %fdiv, float addrspace(1)* %out @@ -104,7 +104,7 @@ ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv fast float %a, %b store float %fdiv, float addrspace(1)* %out @@ -119,7 +119,7 @@ ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv arcp float %a, %b store float %fdiv, float addrspace(1)* %out @@ -136,7 +136,7 @@ ; GCN: v_div_scale_f32 ; GCN: v_div_scale_f32 ; GCN: v_div_scale_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -146,7 +146,7 @@ ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32: ; GCN: v_cmp_gt_f32 ; GCN: v_cmp_gt_f32 -define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -161,7 +161,7 @@ ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv fast <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -176,7 +176,7 @@ ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv arcp <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -197,7 +197,7 @@ ; GCN: v_div_fixup_f32 ; GCN: v_div_fixup_f32 ; GCN: v_div_fixup_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x 
float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -220,7 +220,7 @@ ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -243,7 +243,7 @@ ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr Index: test/CodeGen/AMDGPU/ffloor.f64.ll =================================================================== --- test/CodeGen/AMDGPU/ffloor.f64.ll +++ test/CodeGen/AMDGPU/ffloor.f64.ll @@ -19,7 +19,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 ; SI: s_endpgm -define void @ffloor_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.floor.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -34,7 +34,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] ; SI: s_endpgm -define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { %neg = fsub double 0.0, %x %y = call double @llvm.floor.f64(double %neg) nounwind readnone store double %y, double addrspace(1)* %out @@ -50,7 +50,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| ; SI: s_endpgm -define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { %abs = call double @llvm.fabs.f64(double %x) %neg = fsub double 0.0, %abs %y = call double @llvm.floor.f64(double %neg) nounwind readnone @@ -61,7 +61,7 @@ ; FUNC-LABEL: {{^}}ffloor_v2f64: ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -72,7 +72,7 @@ ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI-NOT: v_floor_f64_e32 -define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +define amdgpu_kernel void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone store <3 x double> %y, <3 x double> addrspace(1)* %out ret void @@ -83,7 +83,7 @@ ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone store <4 
x double> %y, <4 x double> addrspace(1)* %out ret void @@ -98,7 +98,7 @@ ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -121,7 +121,7 @@ ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/ffloor.ll =================================================================== --- test/CodeGen/AMDGPU/ffloor.ll +++ test/CodeGen/AMDGPU/ffloor.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}floor_f32: ; SI: v_floor_f32_e32 ; R600: FLOOR -define void @floor_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @floor_f32(float addrspace(1)* %out, float %in) { %tmp = call float @llvm.floor.f32(float %in) #0 store float %tmp, float addrspace(1)* %out ret void @@ -15,7 +15,7 @@ ; SI: v_floor_f32_e32 ; SI: v_floor_f32_e32 -define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 store <2 x float> %tmp, <2 x float> addrspace(1)* %out ret void @@ -31,7 +31,7 @@ ; R600: FLOOR ; R600: FLOOR ; R600: FLOOR -define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 store <4 x float> %tmp, <4 x float> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/flat-address-space.ll =================================================================== --- test/CodeGen/AMDGPU/flat-address-space.ll +++ test/CodeGen/AMDGPU/flat-address-space.ll @@ -17,7 +17,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]] -define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* store volatile i32 %x, i32 addrspace(4)* %fptr, align 4 ret void @@ -25,7 +25,7 @@ ; CHECK-LABEL: {{^}}store_flat_i64: ; CHECK: flat_store_dwordx2 -define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { +define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* store volatile i64 %x, i64 addrspace(4)* %fptr, align 8 ret void @@ -33,7 +33,7 @@ ; CHECK-LABEL: {{^}}store_flat_v4i32: ; CHECK: flat_store_dwordx4 -define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 ret void @@ 
-41,7 +41,7 @@ ; CHECK-LABEL: {{^}}store_flat_trunc_i16: ; CHECK: flat_store_short -define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* %y = trunc i32 %x to i16 store volatile i16 %y, i16 addrspace(4)* %fptr, align 2 @@ -50,7 +50,7 @@ ; CHECK-LABEL: {{^}}store_flat_trunc_i8: ; CHECK: flat_store_byte -define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* %y = trunc i32 %x to i8 store volatile i8 %y, i8 addrspace(4)* %fptr, align 2 @@ -61,7 +61,7 @@ ; CHECK-LABEL: load_flat_i32: ; CHECK: flat_load_dword -define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4 store i32 %fload, i32 addrspace(1)* %out, align 4 @@ -70,7 +70,7 @@ ; CHECK-LABEL: load_flat_i64: ; CHECK: flat_load_dwordx2 -define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8 store i64 %fload, i64 addrspace(1)* %out, align 8 @@ -79,7 +79,7 @@ ; CHECK-LABEL: load_flat_v4i32: ; CHECK: flat_load_dwordx4 -define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32 store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 @@ -88,7 +88,7 @@ ; CHECK-LABEL: sextload_flat_i8: ; CHECK: flat_load_sbyte -define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 %ext = sext i8 %fload to i32 @@ -98,7 +98,7 @@ ; CHECK-LABEL: zextload_flat_i8: ; CHECK: flat_load_ubyte -define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 %ext = zext i8 %fload to i32 @@ -108,7 +108,7 @@ ; CHECK-LABEL: sextload_flat_i16: ; CHECK: flat_load_sshort -define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 %ext = sext 
i16 %fload to i32 @@ -118,7 +118,7 @@ ; CHECK-LABEL: zextload_flat_i16: ; CHECK: flat_load_ushort -define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 %ext = zext i16 %fload to i32 @@ -131,7 +131,7 @@ ; CHECK: flat_load_ubyte ; CHECK: flat_load_ubyte ; CHECK: flat_load_ubyte -define void @flat_scratch_unaligned_load() { +define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1 @@ -143,7 +143,7 @@ ; CHECK: flat_store_byte ; CHECK: flat_store_byte ; CHECK: flat_store_byte -define void @flat_scratch_unaligned_store() { +define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %fptr, align 1 @@ -154,7 +154,7 @@ ; HSA: flat_load_dword ; HSA: flat_load_dword ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr -define void @flat_scratch_multidword_load() { +define amdgpu_kernel void @flat_scratch_multidword_load() { %scratch = alloca <2 x i32> %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr @@ -165,7 +165,7 @@ ; HSA: flat_store_dword ; HSA: flat_store_dword ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr -define void @flat_scratch_multidword_store() { +define amdgpu_kernel void @flat_scratch_multidword_store() { %scratch = alloca <2 x i32> %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr Index: test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll =================================================================== --- test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -23,7 +23,7 @@ ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void @@ -36,7 +36,7 @@ ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define void @test_addr64(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) { entry: %out.addr = alloca i32 addrspace(1)*, align 4 Index: test/CodeGen/AMDGPU/flat-scratch-reg.ll =================================================================== --- test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -19,7 +19,7 @@ ; CI: ; NumSgprs: 8 ; VI-NOXNACK: ; NumSgprs: 8 ; VI-XNACK: ; NumSgprs: 12 -define void @no_vcc_no_flat() { +define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{SGPR7}"() ret void @@ -33,7 +33,7 @@ ; CI: ; NumSgprs: 10 ; VI-NOXNACK: ; NumSgprs: 10 ; VI-XNACK: ; NumSgprs: 12 -define void @vcc_no_flat() { +define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC}"() ret void @@ -50,7 +50,7 @@ ; HSA-CI: ; NumSgprs: 8 ;
HSA-VI-NOXNACK: ; NumSgprs: 8 ; HSA-VI-XNACK: ; NumSgprs: 12 -define void @no_vcc_flat() { +define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() ret void @@ -66,7 +66,7 @@ ; HSA-CI: ; NumSgprs: 10 ; HSA-VI-NOXNACK: ; NumSgprs: 10 ; HSA-VI-XNACK: ; NumSgprs: 12 -define void @vcc_flat() { +define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() ret void Index: test/CodeGen/AMDGPU/flat_atomics.ll =================================================================== --- test/CodeGen/AMDGPU/flat_atomics.ll +++ test/CodeGen/AMDGPU/flat_atomics.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_offset: ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_ret_offset: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -34,7 +34,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_ret: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -63,7 +63,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_addr64: ; 
GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -83,7 +83,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32_offset: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst @@ -93,7 +93,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32_ret_offset: ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst @@ -103,7 +103,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -114,7 +114,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -125,7 +125,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -134,7 +134,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32_ret: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, 
i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -143,7 +143,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32_addr64: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -153,7 +153,7 @@ ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -163,7 +163,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32_offset: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst @@ -173,7 +173,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst @@ -183,7 +183,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -194,7 +194,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -205,7 +205,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(4)* 
%out, i32 %in seq_cst ret void @@ -214,7 +214,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32_ret: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -223,7 +223,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32_addr64: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -233,7 +233,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -243,7 +243,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_offset: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst @@ -253,7 +253,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_ret_offset: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst @@ -263,7 +263,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -274,7 +274,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 
4 @@ -285,7 +285,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -294,7 +294,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_ret: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -303,7 +303,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_addr64: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -313,7 +313,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -323,7 +323,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_offset: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst @@ -333,7 +333,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst @@ -343,7 +343,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -354,7 +354,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void 
@atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -365,7 +365,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -374,7 +374,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_ret: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -383,7 +383,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_addr64: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -393,7 +393,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -403,7 +403,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_offset: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst @@ -413,7 +413,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_ret_offset: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst @@ -423,7 +423,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = 
getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -434,7 +434,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -445,7 +445,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -454,7 +454,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_ret: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -463,7 +463,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_addr64: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -473,7 +473,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -483,7 +483,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_offset: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst @@ -493,7 +493,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst @@ -503,7 
+503,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -514,7 +514,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -525,7 +525,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -534,7 +534,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_ret: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -543,7 +543,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_addr64: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -553,7 +553,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}} - define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { + define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -563,7 +563,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_offset: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst @@ -573,7 +573,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_ret_offset: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void 
@atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst @@ -583,7 +583,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -594,7 +594,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -605,7 +605,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -614,7 +614,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_ret: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -623,7 +623,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_addr64: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -633,7 +633,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -643,7 +643,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_offset: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 
%val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst @@ -653,7 +653,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst @@ -663,7 +663,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -674,7 +674,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -685,7 +685,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32: ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -694,7 +694,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_ret: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -703,7 +703,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -713,7 +713,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(4)* 
%ptr, i32 %in seq_cst @@ -725,7 +725,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -735,7 +735,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -746,7 +746,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -757,7 +757,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -769,7 +769,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32: ; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -778,7 +778,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 @@ -788,7 +788,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, 
i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -798,7 +798,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -809,7 +809,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_offset: ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst @@ -819,7 +819,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst @@ -829,7 +829,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -840,7 +840,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -851,7 +851,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32: ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -860,7 +860,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_ret: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, 
i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -869,7 +869,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_addr64: ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -879,7 +879,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -890,7 +890,7 @@ ; GCN-LABEL: {{^}}atomic_load_i32_offset: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { entry: %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4 %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4 @@ -901,7 +901,7 @@ ; GCN-LABEL: {{^}}atomic_load_i32: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { entry: %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4 store i32 %val, i32 addrspace(4)* %out @@ -911,7 +911,7 @@ ; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -923,7 +923,7 @@ ; GCN-LABEL: {{^}}atomic_load_i32_addr64: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4 @@ -933,7 +933,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32_offset: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 store atomic i32 
%in, i32 addrspace(4)* %gep seq_cst, align 4 @@ -942,7 +942,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { entry: store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4 ret void @@ -950,7 +950,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -960,7 +960,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32_addr64: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4 Index: test/CodeGen/AMDGPU/flat_atomics_i64.ll =================================================================== --- test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -34,7 +34,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, 
i64 addrspace(4)* %ptr, i64 4 @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_ret: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -63,7 +63,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -83,7 +83,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst @@ -93,7 +93,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst @@ -103,7 +103,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -114,7 +114,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset: ; GCN: flat_atomic_and_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -125,7 +125,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -134,7 +134,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_ret: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -143,7 +143,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -153,7 +153,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -163,7 +163,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst @@ -173,7 +173,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst @@ -183,7 +183,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; GCN: 
flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -194,7 +194,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -205,7 +205,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -214,7 +214,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_ret: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -223,7 +223,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -233,7 +233,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -243,7 +243,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst @@ -253,7 +253,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; 
GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst @@ -263,7 +263,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -274,7 +274,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -285,7 +285,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -294,7 +294,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_ret: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -303,7 +303,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -313,7 +313,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -323,7 +323,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], 
v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst @@ -333,7 +333,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst @@ -343,7 +343,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -354,7 +354,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -365,7 +365,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -374,7 +374,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_ret: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -383,7 +383,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -393,7 +393,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 
v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -403,7 +403,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst @@ -413,7 +413,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst @@ -423,7 +423,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -434,7 +434,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -445,7 +445,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -454,7 +454,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_ret: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -463,7 +463,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void 
@atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -473,7 +473,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -483,7 +483,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst @@ -493,7 +493,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst @@ -503,7 +503,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -514,7 +514,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -525,7 +525,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -534,7 +534,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_ret: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: 
flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -543,7 +543,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -553,7 +553,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -563,7 +563,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst @@ -573,7 +573,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst @@ -583,7 +583,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -594,7 +594,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -605,7 +605,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64: ; 
GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -614,7 +614,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_ret: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -623,7 +623,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -633,7 +633,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -643,7 +643,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst @@ -653,7 +653,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst @@ -663,7 +663,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -674,7 +674,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: 
flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -685,7 +685,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -694,7 +694,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_ret: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -703,7 +703,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -713,7 +713,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -723,7 +723,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst @@ -733,7 +733,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst @@ -743,7 +743,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} 
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -754,7 +754,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -765,7 +765,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -774,7 +774,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_ret: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -783,7 +783,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -793,7 +793,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -804,7 +804,7 @@ ; GCN-LABEL: {{^}}atomic_load_i64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { entry: %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4 %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8 @@ -815,7 +815,7 @@ ; GCN-LABEL: {{^}}atomic_load_i64: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: 
flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { entry: %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8 store i64 %val, i64 addrspace(4)* %out @@ -825,7 +825,7 @@ ; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -837,7 +837,7 @@ ; GCN-LABEL: {{^}}atomic_load_i64_addr64: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8 @@ -847,7 +847,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64_offset: ; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8 @@ -856,7 +856,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc -define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { entry: store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8 ret void @@ -864,7 +864,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -874,7 +874,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64_addr64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8 @@ -883,7 +883,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %val 
= cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -892,7 +892,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -902,7 +902,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -913,7 +913,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -924,7 +924,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -936,7 +936,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst ret void @@ -945,7 +945,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -955,7 +955,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void 
@atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -965,7 +965,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst Index: test/CodeGen/AMDGPU/fma-combine.ll =================================================================== --- test/CodeGen/AMDGPU/fma-combine.ll +++ test/CodeGen/AMDGPU/fma-combine.ll @@ -18,7 +18,7 @@ ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -46,7 +46,7 @@ ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -75,7 +75,7 @@ ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -99,7 +99,7 @@ ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double 
addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -127,7 +127,7 @@ ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -156,7 +156,7 @@ ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -184,7 +184,7 @@ ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -213,7 +213,7 @@ ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -242,7 +242,7 @@ ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double 
addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -276,7 +276,7 @@ ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -317,7 +317,7 @@ ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -358,7 +358,7 @@ ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -390,7 +390,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load volatile float, float addrspace(1)* %in1 @@ -406,7 +406,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load volatile float, float addrspace(1)* %in1 @@ -422,7 +422,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -438,7 +438,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_add_x_negone(float 
addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -454,7 +454,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -470,7 +470,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -486,7 +486,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -502,7 +502,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -518,7 +518,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -534,7 +534,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -550,7 +550,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -566,7 +566,7 @@ ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -588,7 +588,7 @@ ; ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]] ; SI-FMA: v_fma_f32 {{v[0-9]}}, 
[[VX:v[0-9]]], [[VT]], [[VR]] -define void @test_f32_interp(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { @@ -610,7 +610,7 @@ ; ; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]] ; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] -define void @test_f64_interp(double addrspace(1)* %out, +define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { Index: test/CodeGen/AMDGPU/fma.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fma.f64.ll +++ test/CodeGen/AMDGPU/fma.f64.ll @@ -8,7 +8,7 @@ ; FUNC-LABEL: {{^}}fma_f64: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -21,7 +21,7 @@ ; FUNC-LABEL: {{^}}fma_v2f64: ; SI: v_fma_f64 ; SI: v_fma_f64 -define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -36,7 +36,7 @@ ; SI: v_fma_f64 ; SI: v_fma_f64 ; SI: v_fma_f64 -define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 Index: test/CodeGen/AMDGPU/fma.ll =================================================================== --- test/CodeGen/AMDGPU/fma.ll +++ test/CodeGen/AMDGPU/fma.ll @@ -12,7 +12,7 @@ ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG: FMA {{\*? *}}[[RES]] -define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { %r0 = load float, float addrspace(1)* %in1 %r1 = load float, float addrspace(1)* %in2 @@ -29,7 +29,7 @@ ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]] -define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 @@ -50,7 +50,7 @@ ; EG-DAG: FMA {{\*? *}}[[RES]].Y ; EG-DAG: FMA {{\*? *}}[[RES]].Z ; EG-DAG: FMA {{\*? 
*}}[[RES]].W -define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 @@ -62,7 +62,7 @@ ; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 ; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}} -define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -77,7 +77,7 @@ } ; FUNC-LABEL: @fma_commute_mul_s_f32 -define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { +define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid Index: test/CodeGen/AMDGPU/fmax3.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fmax3.f64.ll +++ test/CodeGen/AMDGPU/fmax3.f64.ll @@ -11,7 +11,7 @@ ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { +define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 %a = load volatile double, double addrspace(1)* %aptr, align 8 Index: test/CodeGen/AMDGPU/fmax3.ll =================================================================== --- test/CodeGen/AMDGPU/fmax3.ll +++ test/CodeGen/AMDGPU/fmax3.ll @@ -10,7 +10,7 @@ ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 @@ -28,7 +28,7 @@ ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* 
%bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 Index: test/CodeGen/AMDGPU/fmax_legacy.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: @test_fmax_legacy_uge_f64 -define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -19,7 +19,7 @@ } ; FUNC-LABEL: @test_fmax_legacy_oge_f64 -define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -34,7 +34,7 @@ } ; FUNC-LABEL: @test_fmax_legacy_ugt_f64 -define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -49,7 +49,7 @@ } ; FUNC-LABEL: @test_fmax_legacy_ogt_f64 -define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/fmax_legacy.ll =================================================================== --- test/CodeGen/AMDGPU/fmax_legacy.ll +++ test/CodeGen/AMDGPU/fmax_legacy.ll @@ -13,7 +13,7 @@ ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -33,7 +33,7 @@ ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -53,7 +53,7 @@ ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: 
MAX -define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -73,7 +73,7 @@ ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -93,7 +93,7 @@ ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 @@ -114,7 +114,7 @@ ; SI-NONAN: v_max_f32_e32 ; SI-NONAN: v_max_f32_e32 ; SI-NONAN: v_max_f32_e32 -define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 @@ -137,7 +137,7 @@ ; SI-NOT: v_max_ ; EG: MAX -define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/fmaxnum.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fmaxnum.f64.ll +++ test/CodeGen/AMDGPU/fmaxnum.f64.ll @@ -9,7 +9,7 @@ ; FUNC-LABEL: @test_fmax_f64 ; SI: v_max_f64 -define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { %val = call double @llvm.maxnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* %out, align 8 ret void @@ -18,7 +18,7 @@ ; FUNC-LABEL: @test_fmax_v2f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 ret void @@ -29,7 +29,7 @@ ; SI: v_max_f64 ; SI: 
v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 ret void @@ -44,7 +44,7 @@ ; SI: v_max_f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 ret void @@ -67,7 +67,7 @@ ; SI: v_max_f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 ret void Index: test/CodeGen/AMDGPU/fmaxnum.ll =================================================================== --- test/CodeGen/AMDGPU/fmaxnum.ll +++ test/CodeGen/AMDGPU/fmaxnum.ll @@ -14,7 +14,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %val = call float @llvm.maxnum.f32(float %a, float %b) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void @@ -44,7 +44,7 @@ ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0 store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void @@ -70,7 +70,7 @@ ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0 store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void @@ -114,7 +114,7 @@ ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W -define void 
@test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void @@ -128,7 +128,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -143,7 +143,7 @@ ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} ; EG: 2143289344(nan) -define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -157,7 +157,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -171,7 +171,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -185,7 +185,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -199,7 +199,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -213,7 +213,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -227,7 +227,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV 
{{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -250,7 +250,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -262,7 +262,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -274,7 +274,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/fmed3.ll =================================================================== --- test/CodeGen/AMDGPU/fmed3.ll +++ test/CodeGen/AMDGPU/fmed3.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32: ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}} ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 -define void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -27,7 +27,7 @@ ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -45,7 +45,7 @@ ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* 
%out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -63,7 +63,7 @@ ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -96,7 +96,7 @@ ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -113,7 +113,7 @@ ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64: ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0 -define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid @@ -128,7 +128,7 @@ ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32: ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 -define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -146,7 +146,7 @@ ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* 
%aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -169,7 +169,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -192,7 +192,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -215,7 +215,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]] -define void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -238,7 +238,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]| -define void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -267,7 +267,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]| -define void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float 
addrspace(1)* %bptr, i32 %tid @@ -301,7 +301,7 @@ ; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] ; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] -define void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -341,7 +341,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -363,7 +363,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -385,7 +385,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -407,7 +407,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -429,7 +429,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; 
GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -451,7 +451,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -473,7 +473,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -495,7 +495,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -517,7 +517,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -539,7 +539,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float 
addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -561,7 +561,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -583,7 +583,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -605,7 +605,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -627,7 +627,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -649,7 +649,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float 
addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -671,7 +671,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -697,7 +697,7 @@ ; GCN-DAG: v_max_f32 ; GCN: v_min_f32 ; GCN: v_max_f32 -define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -716,7 +716,7 @@ } ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1: -define void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -735,7 +735,7 @@ } ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2: -define void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -755,7 +755,7 @@ ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0: -define void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -773,7 +773,7 @@ } ; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0: -define void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void 
@v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -796,7 +796,7 @@ } ; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0: -define void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -819,7 +819,7 @@ } ; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0: -define void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -849,7 +849,7 @@ ; GCN: v_max_f32 ; GCN: v_min_f32 ; GCN: v_max_f32 -define void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -874,7 +874,7 @@ ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]] ; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]] -define void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -901,7 +901,7 @@ ; GFX9: v_add_f16_e32 v{{[0-9]+}}, 1.0 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 -define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -938,7 +938,7 @@ ; VI: v_max_f16 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] -define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 { 
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid Index: test/CodeGen/AMDGPU/fmin3.ll =================================================================== --- test/CodeGen/AMDGPU/fmin3.ll +++ test/CodeGen/AMDGPU/fmin3.ll @@ -11,7 +11,7 @@ ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 @@ -29,7 +29,7 @@ ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 Index: test/CodeGen/AMDGPU/fmin_legacy.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -3,7 +3,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: @test_fmin_legacy_f64 -define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { +define amdgpu_kernel void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { %r0 = extractelement <4 x double> %reg0, i32 0 %r1 = extractelement <4 x double> %reg0, i32 1 %r2 = fcmp uge double %r0, %r1 @@ -14,7 +14,7 @@ } ; FUNC-LABEL: @test_fmin_legacy_ule_f64 -define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -29,7 +29,7 @@ } ; FUNC-LABEL: @test_fmin_legacy_ole_f64 -define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -44,7 +44,7 @@ } ; FUNC-LABEL: @test_fmin_legacy_olt_f64 -define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* 
%in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -59,7 +59,7 @@ } ; FUNC-LABEL: @test_fmin_legacy_ult_f64 -define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/fmin_legacy.ll =================================================================== --- test/CodeGen/AMDGPU/fmin_legacy.ll +++ test/CodeGen/AMDGPU/fmin_legacy.ll @@ -14,7 +14,7 @@ ; EG: MIN * ; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { +define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = fcmp uge float %r0, %r1 @@ -34,7 +34,7 @@ ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]] -define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b store float %val, float addrspace(1)* %out, align 4 @@ -46,7 +46,7 @@ ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -65,7 +65,7 @@ ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -84,7 +84,7 @@ ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* 
%in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -103,7 +103,7 @@ ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -122,7 +122,7 @@ ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 @@ -144,7 +144,7 @@ ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1 @@ -166,7 +166,7 @@ ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 @@ -188,7 +188,7 @@ ; SI-NEXT: v_cndmask_b32 ; SI-NOT: v_min ; SI: s_endpgm -define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/fminnum.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fminnum.f64.ll +++ test/CodeGen/AMDGPU/fminnum.f64.ll @@ -9,7 +9,7 @@ ; FUNC-LABEL: @test_fmin_f64 ; SI: v_min_f64 -define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { %val = call double @llvm.minnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* %out, align 8 ret void @@ -18,7 +18,7 @@ ; FUNC-LABEL: @test_fmin_v2f64 ; SI: v_min_f64 ; SI: v_min_f64 
-define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 ret void @@ -29,7 +29,7 @@ ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 ret void @@ -44,7 +44,7 @@ ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 ret void @@ -67,7 +67,7 @@ ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 ret void Index: test/CodeGen/AMDGPU/fminnum.ll =================================================================== --- test/CodeGen/AMDGPU/fminnum.ll +++ test/CodeGen/AMDGPU/fminnum.ll @@ -13,7 +13,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %val = call float @llvm.minnum.f32(float %a, float %b) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void @@ -43,7 +43,7 @@ ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void @@ -69,7 +69,7 @@ ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) 
nounwind { +define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void @@ -113,7 +113,7 @@ ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W -define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void @@ -127,7 +127,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -142,7 +142,7 @@ ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} ; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) -define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -156,7 +156,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -170,7 +170,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -198,7 +198,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -212,7 +212,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV 
{{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -226,7 +226,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -237,7 +237,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -248,7 +248,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -260,7 +260,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -272,7 +272,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -23,7 +23,7 @@ ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 -define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { +define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %a11 = fadd fast float %y, -1.0 %a12 = call float @llvm.fabs.f32(float %a11) %a13 = fadd fast float %x, -1.0 @@ -44,7 +44,7 @@ ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void 
@multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %mad = fadd fast float %mul2, %y @@ -59,7 +59,7 @@ ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) %mul2 = fmul fast float %x.abs, 2.0 @@ -72,7 +72,7 @@ ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32: ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} -define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { +define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) %mul2 = fmul fast float %x.abs, 2.0 @@ -87,7 +87,7 @@ ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_dword [[RESULT]] -define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %muln2 = fmul fast float %x, -2.0 @@ -101,7 +101,7 @@ ; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_dword [[RESULT]] -define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %muln2 = fmul fast float %x, -3.0 @@ -119,7 +119,7 @@ ; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 -define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { +define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %z = bitcast i16 %z.arg to half @@ -146,7 +146,7 @@ ; GCN-DAG: buffer_store_short [[MUL2]] ; GCN-DAG: buffer_store_short [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -166,7 +166,7 @@ ; GCN-DAG: buffer_store_short [[MUL2]] ; GCN-DAG: buffer_store_short [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* 
%out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -185,7 +185,7 @@ ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} -define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %z = bitcast i16 %z.arg to half @@ -203,7 +203,7 @@ ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] -define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -219,7 +219,7 @@ ; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] -define void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 Index: test/CodeGen/AMDGPU/fmul.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmul.f16.ll +++ test/CodeGen/AMDGPU/fmul.f16.ll @@ -11,7 +11,7 @@ ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16( +define amdgpu_kernel void @fmul_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -31,7 +31,7 @@ ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16_imm_a( +define amdgpu_kernel void @fmul_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -50,7 +50,7 @@ ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16_imm_b( +define amdgpu_kernel void @fmul_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -83,7 +83,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16( +define amdgpu_kernel void @fmul_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -110,7 +110,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16_imm_a( +define amdgpu_kernel void @fmul_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -135,7 +135,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16_imm_b( +define amdgpu_kernel void @fmul_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x 
half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/fmul.ll =================================================================== --- test/CodeGen/AMDGPU/fmul.ll +++ test/CodeGen/AMDGPU/fmul.ll @@ -6,7 +6,7 @@ ; GCN: v_mul_f32 ; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { entry: %0 = fmul float %a, %b store float %0, float addrspace(1)* %out @@ -19,7 +19,7 @@ ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} -define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { entry: %0 = fmul <2 x float> %a, %b store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -36,7 +36,7 @@ ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -49,7 +49,7 @@ ; GCN: v_mul_f32 ; GCN-NOT: v_mul_f32 ; GCN: s_endpgm -define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { %y = fmul float %x, 2.0 %z = fmul float %y, 3.0 store float %z, float addrspace(1)* %out @@ -61,7 +61,7 @@ ; GCN-NOT: v_mul_f32 ; GCN-NOT: v_mad_f32 ; GCN: s_endpgm -define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { %y = fmul float %x, 3.0 %z = fmul float %y, 2.0 store float %z, float addrspace(1)* %out @@ -75,7 +75,7 @@ ; GCN: v_mul_f32 ; GCN: v_mul_f32 ; GCN-NOT: v_mul_f32 -define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { %a = fmul float %x, 5.0 %b = fsub float -0.0, %a %c = fmul float %b, %y Index: test/CodeGen/AMDGPU/fmul64.ll =================================================================== --- test/CodeGen/AMDGPU/fmul64.ll +++ test/CodeGen/AMDGPU/fmul64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fmul_f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -15,7 +15,7 @@ ; FUNC-LABEL: {{^}}fmul_v2f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 
x double>, <2 x double> addrspace(1)* %in2 @@ -29,7 +29,7 @@ ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, <4 x double> addrspace(1)* %in2) { %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 Index: test/CodeGen/AMDGPU/fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.f16.ll +++ test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -16,7 +16,7 @@ ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { %r0 = load half, half addrspace(1)* %in1 %r1 = load half, half addrspace(1)* %in2 @@ -34,7 +34,7 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -56,7 +56,7 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -82,7 +82,7 @@ ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f16(half addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -111,7 +111,7 @@ ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f16(half addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -134,7 +134,7 @@ ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half 
addrspace(1)* %gep.0, i32 1 @@ -156,7 +156,7 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -180,7 +180,7 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -202,7 +202,7 @@ ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -231,7 +231,7 @@ ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -261,7 +261,7 @@ ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -291,7 +291,7 @@ ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -323,7 +323,7 @@ ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void 
@mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -355,7 +355,7 @@ ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -388,7 +388,7 @@ ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -419,7 +419,7 @@ ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -447,7 +447,7 @@ ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/fmuladd.f32.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.f32.ll +++ test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -25,7 +25,7 @@ ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { %r0 = load float, float addrspace(1)* %in1 %r1 = load float, float addrspace(1)* %in2 @@ -45,7 +45,7 @@ ; GCN-DENORM-STRICT: v_mul_f32_e32 ; GCN-DENORM-STRICT: v_add_f32_e32 -define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* 
%out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { %r0 = load volatile float, float addrspace(1)* %in1 %r1 = load volatile float, float addrspace(1)* %in2 @@ -71,7 +71,7 @@ ; SI-DENORM buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -100,7 +100,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -132,7 +132,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f32(float addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -167,7 +167,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f32(float addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -196,7 +196,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -225,7 +225,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -256,7 +256,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -286,7 +286,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void 
@fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -318,7 +318,7 @@ ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -353,7 +353,7 @@ ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -387,7 +387,7 @@ ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -422,7 +422,7 @@ ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -460,7 +460,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -496,7 +496,7 @@ ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 
%tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -532,7 +532,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -563,7 +563,7 @@ ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/fmuladd.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.f64.ll +++ test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}fmuladd_f64: ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -22,7 +22,7 @@ ; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -44,7 +44,7 @@ ; SI: buffer_store_dwordx2 [[RESULT]] ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f64(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -72,7 +72,7 @@ ; SI: buffer_store_dwordx2 [[RESULT]] ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f64(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -94,7 +94,7 @@ ; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} ; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 
@llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext @@ -117,7 +117,7 @@ ; GCN-STRICT: v_add_f64 ; GCN-CONTRACT: v_fma_f64 -define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -139,7 +139,7 @@ ; GCN-STRICT: v_add_f64 ; GCN-CONTRACT: v_fma_f64 -define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -158,7 +158,7 @@ ; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast: ; GCN: v_fma_f64 -define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone Index: test/CodeGen/AMDGPU/fmuladd.v2f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -17,7 +17,7 @@ ; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 { %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1 %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2 @@ -37,7 +37,7 @@ ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 @@ -61,7 +61,7 @@ ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 @@ -86,7 +86,7 @@ ; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, <2 x half> addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/fnearbyint.ll =================================================================== --- test/CodeGen/AMDGPU/fnearbyint.ll +++ 
test/CodeGen/AMDGPU/fnearbyint.ll @@ -13,41 +13,41 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 -define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { entry: %0 = call float @llvm.nearbyint.f32(float %in) store float %0, float addrspace(1)* %out ret void } -define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out ret void } -define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +define amdgpu_kernel void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { entry: %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out ret void } -define void @nearbyint_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @nearbyint_f64(double addrspace(1)* %out, double %in) { entry: %0 = call double @llvm.nearbyint.f64(double %in) store double %0, double addrspace(1)* %out ret void } -define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { entry: %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) store <2 x double> %0, <2 x double> addrspace(1)* %out ret void } -define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { entry: %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) store <4 x double> %0, <4 x double> addrspace(1)* %out Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-combines.ll +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -14,7 +14,7 @@ ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -35,7 +35,7 @@ ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -62,7 +62,7 @@ ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[ADD]] ; GCN: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void 
@v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -87,7 +87,7 @@ ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -111,7 +111,7 @@ ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -135,7 +135,7 @@ ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -164,7 +164,7 @@ ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]] -define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -192,7 +192,7 @@ ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -218,7 +218,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = 
call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -239,7 +239,7 @@ ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[ADD]] -define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -261,7 +261,7 @@ ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MUL1]] -define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -282,7 +282,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -302,7 +302,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -322,7 +322,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -345,7 +345,7 @@ ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() 
%tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -368,7 +368,7 @@ ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -394,7 +394,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -412,7 +412,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -428,7 +428,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -444,7 +444,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -460,7 +460,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -476,7 +476,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 ; GCN: buffer_store_dword [[RESULT]] -define void 
@v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -494,7 +494,7 @@ ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -516,7 +516,7 @@ ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MAX0]] ; GCN-NEXT: buffer_store_dword [[MUL1]] -define void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -541,7 +541,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -559,7 +559,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -575,7 +575,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -591,7 +591,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -607,7 +607,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -623,7 +623,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -641,7 +641,7 @@ ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -663,7 +663,7 @@ ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MAX0]] ; GCN-NEXT: buffer_store_dword [[MUL1]] -define void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -693,7 +693,7 @@ ; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -717,7 +717,7 @@ ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] ; GCN-NEXT: buffer_store_dword [[NEG_FMA]] ; GCN-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = 
getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -748,7 +748,7 @@ ; GCN-NEXT: buffer_store_dword [[NEG_FMA]] ; GCN-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -776,7 +776,7 @@ ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -803,7 +803,7 @@ ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -830,7 +830,7 @@ ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -858,7 +858,7 @@ ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -886,7 +886,7 @@ ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() 
%tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -916,7 +916,7 @@ ; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]] -define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -946,7 +946,7 @@ ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]] ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { +define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -979,7 +979,7 @@ ; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1009,7 +1009,7 @@ ; GCN: buffer_store_dword [[NEG_MAD]] ; GCN-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1035,7 +1035,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1051,7 +1051,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double 
addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1070,7 +1070,7 @@ ; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN: buffer_store_dwordx2 [[RESULT]] ; GCN: buffer_store_dword [[FNEG_A]] -define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1090,7 +1090,7 @@ ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}} -define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1110,7 +1110,7 @@ ; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] -define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1126,7 +1126,7 @@ ; FIXME: Source modifiers not folded for f16->f32 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32: -define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext @@ -1140,7 +1140,7 @@ } ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32: -define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext @@ -1162,7 +1162,7 @@ ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = 
getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -1178,7 +1178,7 @@ ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -1197,7 +1197,7 @@ ; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]] ; GCN: buffer_store_dword [[RESULT]] ; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}} -define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -1217,7 +1217,7 @@ ; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}} ; GCN: buffer_store_dword [[RESULT]] ; GCN: buffer_store_dwordx2 [[USE1]] -define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { +define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -1236,7 +1236,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_short [[RESULT]] -define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1252,7 +1252,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] ; GCN: buffer_store_short [[RESULT]] -define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1271,7 +1271,7 @@ ; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]] ; GCN: buffer_store_dword [[NEG]] ; GCN: buffer_store_dword [[CVT]] -define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -1290,7 +1290,7 @@ ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN: buffer_store_short 
[[RESULT]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1310,7 +1310,7 @@ ; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s ; GCN: buffer_store_short [[RESULT]] ; GCN: buffer_store_dword [[USE1]] -define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1333,7 +1333,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1349,7 +1349,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1368,7 +1368,7 @@ ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN: buffer_store_dword [[RESULT]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1388,7 +1388,7 @@ ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1411,7 +1411,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1432,7 +1432,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1453,7 +1453,7 @@ ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1476,7 +1476,7 @@ ; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1497,7 +1497,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1517,7 +1517,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1537,7 +1537,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float 
addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1560,7 +1560,7 @@ ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1583,7 +1583,7 @@ ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1610,7 +1610,7 @@ ; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]] ; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1626,7 +1626,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1646,7 +1646,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1673,7 +1673,7 @@ ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1693,7 +1693,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: 
v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1713,7 +1713,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1735,7 +1735,7 @@ ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]] ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]] -define void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1758,7 +1758,7 @@ ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]] ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]] -define void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1791,7 +1791,7 @@ ; GCN: buffer_store_dword [[MUL1]] ; GCN: buffer_store_dword [[MUL0]] -define void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { +define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1827,7 +1827,7 @@ ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] ; GCN: ; use [[MUL]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { +define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1856,7 +1856,7 @@ ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]] ; GCN: ; use [[NEG]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* 
%b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { +define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1888,7 +1888,7 @@ ; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0 ; GCN-NEXT: buffer_store_dword [[FMA0]] ; GCN-NEXT: buffer_store_dword [[FMA1]] -define void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1920,7 +1920,7 @@ ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] ; GCN-NEXT: buffer_store_dword [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MUL1]] -define void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1951,7 +1951,7 @@ ; GCN: buffer_store_dword [[FMA0]] ; GCN-NEXT: buffer_store_dword [[MUL1]] -define void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1990,7 +1990,7 @@ ; GCN: buffer_store_dword [[MUL1]] ; GCN-NEXT: buffer_store_dword [[MUL2]] -define void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { +define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -2025,7 +2025,7 @@ ; GCN: buffer_store_dwordx2 [[MUL0]] ; GCN: buffer_store_dwordx2 [[MUL1]] -define void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 { +define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr 
inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -2058,7 +2058,7 @@ ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] ; GCN: buffer_store_dword [[FMA0]] -define void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { +define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -2088,7 +2088,7 @@ ; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]] ; GCN: buffer_store_dword [[FMA0]] ; GCN: buffer_store_dword [[MUL1]] -define void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { +define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext Index: test/CodeGen/AMDGPU/fneg-fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -9,7 +9,7 @@ ; GFX89-NOT: _and ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| -define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { +define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs %fadd = fadd half %y, %fsub @@ -27,7 +27,7 @@ ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}| ; GFX89-NOT: [[MUL]] ; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] -define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { +define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs %fmul = fmul half %y, %fsub @@ -41,7 +41,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_free_f16: ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} -define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) %fsub = fsub half -0.0, %fabs @@ -51,7 +51,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_f16: ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} -define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { +define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.0, %fabs store half %fsub, half addrspace(1)* %out, align 2 @@ -60,7 +60,7 @@ ; GCN-LABEL: {{^}}v_fneg_fabs_f16: ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} -define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %val = load half, half addrspace(1)* %in, align 2 %fabs 
= call half @llvm.fabs.f16(half %val) %fsub = fsub half -0.0, %fabs @@ -76,7 +76,7 @@ ; CIVI: flat_store_dword ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}} -define void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { +define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out @@ -95,7 +95,7 @@ ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} ; GCN: flat_store_dwordx2 -define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { +define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs store <4 x half> %fsub, <4 x half> addrspace(1)* %out @@ -113,7 +113,7 @@ ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0] -define void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { +define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0> @@ -125,7 +125,7 @@ ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff ; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]] ; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]] -define void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { +define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs store <2 x half> %fabs, <2 x half> addrspace(1)* %out0 @@ -136,7 +136,7 @@ ; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0] -define void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { +define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0> Index: test/CodeGen/AMDGPU/fneg-fabs.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_fadd_f64: ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}} -define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) { +define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) { %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs %fadd = fadd double %y, %fsub @@ -14,7 +14,7 @@ ret void } -define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) { +define amdgpu_kernel void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) { %x = load double, double
addrspace(1)* %xptr, align 8 %y = load double, double addrspace(1)* %xptr, align 8 %fabs = call double @llvm.fabs.f64(double %x) @@ -26,7 +26,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_fmul_f64: ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}} -define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) { +define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) { %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs %fmul = fmul double %y, %fsub @@ -35,7 +35,7 @@ } ; GCN-LABEL: {{^}}fneg_fabs_free_f64: -define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) { %bc = bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) %fsub = fsub double -0.000000e+00, %fabs @@ -46,7 +46,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64: ; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { %bc = bitcast i64 %in to double %fabs = call double @fabs(double %bc) %fsub = fsub double -0.000000e+00, %fabs @@ -62,7 +62,7 @@ ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]] ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} -define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { %fabs = call double @llvm.fabs.f64(double %in) %fsub = fsub double -0.000000e+00, %fabs store double %fsub, double addrspace(1)* %out, align 8 @@ -74,7 +74,7 @@ ; GCN-NOT: 0x80000000 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) %fsub = fsub <2 x double> , %fabs store <2 x double> %fsub, <2 x double> addrspace(1)* %out @@ -88,7 +88,7 @@ ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) %fsub = fsub <4 x double> , %fabs store <4 x double> %fsub, <4 x double> addrspace(1)* %out Index: test/CodeGen/AMDGPU/fneg-fabs.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.ll +++ test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and ; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}} -define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { +define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs %fadd = fadd float %y, %fsub @@ -17,7 +17,7 @@ ; SI-NOT: and ; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}} ; SI-NOT: 
and -define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { +define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs %fmul = fmul float %y, %fsub @@ -35,7 +35,7 @@ ; R600: -PV ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) %fsub = fsub float -0.000000e+00, %fabs @@ -49,7 +49,7 @@ ; R600: -PV ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @fabs(float %bc) %fsub = fsub float -0.000000e+00, %fabs @@ -59,7 +59,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_f32: ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) %fsub = fsub float -0.000000e+00, %fabs store float %fsub, float addrspace(1)* %out, align 4 @@ -68,7 +68,7 @@ ; FUNC-LABEL: {{^}}v_fneg_fabs_f32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %val = load float, float addrspace(1)* %in, align 4 %fabs = call float @llvm.fabs.f32(float %val) %fsub = fsub float -0.000000e+00, %fabs @@ -86,7 +86,7 @@ ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs store <2 x float> %fsub, <2 x float> addrspace(1)* %out @@ -99,7 +99,7 @@ ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs store <4 x float> %fsub, <4 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -4,7 +4,7 @@ ; FIXME: Should be able to do scalar op ; GCN-LABEL: {{^}}s_fneg_f16: -define void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 { +define amdgpu_kernel void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 { %fneg = fsub half -0.0, %in store half %fneg, half addrspace(1)* %out ret void @@ -18,7 +18,7 @@ ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] ; SI: buffer_store_short [[XOR]] -define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) 
#0 { +define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid @@ -34,7 +34,7 @@ ; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} ; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] -define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { +define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc store half %fsub, half addrspace(1)* %out @@ -52,7 +52,7 @@ ; VI-NOT: [[NEG_VALUE]] ; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %fsub = fsub half -0.0, %val %fmul = fmul half %fsub, %val @@ -73,7 +73,7 @@ ; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} -define void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { +define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { %fneg = fsub <2 x half> , %in store <2 x half> %fneg, <2 x half> addrspace(1)* %out ret void @@ -82,7 +82,7 @@ ; GCN-LABEL: {{^}}v_fneg_v2f16: ; GCN: flat_load_dword [[VAL:v[0-9]+]] ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]] -define void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid @@ -98,7 +98,7 @@ ; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]] -define void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { %bc = bitcast i32 %in to <2 x half> %fsub = fsub <2 x half> , %bc store <2 x half> %fsub, <2 x half> addrspace(1)* %out @@ -120,7 +120,7 @@ ; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}} -define void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %fsub = fsub <2 x half> , %val %fmul = fmul <2 x half> %fsub, %val Index: test/CodeGen/AMDGPU/fneg.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f64.ll +++ test/CodeGen/AMDGPU/fneg.f64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fneg_f64: ; GCN: v_xor_b32 -define void @fneg_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double %in) { %fneg = fsub double -0.000000e+00, %in store double %fneg, double addrspace(1)* %out ret void @@ -12,7 +12,7 @@ ; FUNC-LABEL: {{^}}fneg_v2f64: ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void 
@fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { +define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { %fneg = fsub <2 x double> , %in store <2 x double> %fneg, <2 x double> addrspace(1)* %out ret void @@ -28,7 +28,7 @@ ; GCN: v_xor_b32 ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { +define amdgpu_kernel void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { %fneg = fsub <4 x double> , %in store <4 x double> %fneg, <4 x double> addrspace(1)* %out ret void @@ -40,7 +40,7 @@ ; FUNC-LABEL: {{^}}fneg_free_f64: ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { %bc = bitcast i64 %in to double %fsub = fsub double 0.0, %bc store double %fsub, double addrspace(1)* %out @@ -52,7 +52,7 @@ ; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-NOT: xor ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) { %fsub = fsub double -0.0, %in %fmul = fmul double %fsub, %in store double %fmul, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/fneg.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.ll +++ test/CodeGen/AMDGPU/fneg.ll @@ -6,7 +6,7 @@ ; R600: -PV ; GCN: v_xor_b32 -define void @s_fneg_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) { %fneg = fsub float -0.000000e+00, %in store float %fneg, float addrspace(1)* %out ret void @@ -18,7 +18,7 @@ ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { +define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { %fneg = fsub <2 x float> , %in store <2 x float> %fneg, <2 x float> addrspace(1)* %out ret void @@ -34,7 +34,7 @@ ; GCN: v_xor_b32 ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { +define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { %fneg = fsub <4 x float> , %in store <4 x float> %fneg, <4 x float> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ ; R600-NOT: XOR ; R600: -KC0[2].Z -define void @fsub0_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fsub0_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fsub = fsub float 0.0, %bc store float %fsub, float addrspace(1)* %out @@ -66,7 +66,7 @@ ; R600-NOT: XOR ; R600: -PV.W -define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fsub = fsub float -0.0, %bc store float %fsub, float addrspace(1)* %out @@ -78,7 +78,7 @@ ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: xor ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fneg_fold_f32(float addrspace(1)* %out, float %in) { %fsub = fsub float -0.0, %in %fmul = fmul float %fsub, 
%in store float %fmul, float addrspace(1)* %out Index: test/CodeGen/AMDGPU/fp-classify.ll =================================================================== --- test/CodeGen/AMDGPU/fp-classify.ll +++ test/CodeGen/AMDGPU/fp-classify.ll @@ -9,7 +9,7 @@ ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] ; SI-NOT: v_cmp ; SI: s_endpgm -define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -20,7 +20,7 @@ ; SI-LABEL: {{^}}test_not_isinf_pattern_0: ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -31,7 +31,7 @@ ; SI-LABEL: {{^}}test_not_isinf_pattern_1: ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 %ext = zext i1 %cmp to i32 @@ -45,7 +45,7 @@ ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] ; SI-NOT: v_cmp ; SI: s_endpgm -define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -59,7 +59,7 @@ ; SI-LABEL: {{^}}test_isfinite_not_pattern_0: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 @@ -73,7 +73,7 @@ ; SI-LABEL: {{^}}test_isfinite_not_pattern_1: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %ninf = fcmp une float %x, 0x7FF0000000000000 %and = and i1 %ord, %ninf @@ -86,7 +86,7 @@ ; SI-LABEL: {{^}}test_isfinite_not_pattern_2: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -100,7 +100,7 @@ ; SI-LABEL: {{^}}test_isfinite_not_pattern_3: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp uno float 
%x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -114,7 +114,7 @@ ; SI-LABEL: {{^}}test_isfinite_not_pattern_4: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 Index: test/CodeGen/AMDGPU/fp16_to_fp32.ll =================================================================== --- test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -14,7 +14,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]] ; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]] ; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]] -define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { %val = load i16, i16 addrspace(1)* %in, align 2 %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone store float %cvt, float addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/fp16_to_fp64.ll =================================================================== --- test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ ; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { %val = load i16, i16 addrspace(1)* %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone store double %cvt, double addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/fp32_to_fp16.ll =================================================================== --- test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -12,7 +12,7 @@ ; EG: MEM_RAT MSKOR ; EG: VTX_READ_32 ; EG: FLT32_TO_FLT16 -define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone store i16 %cvt, i16 addrspace(1)* %out, align 2 Index: test/CodeGen/AMDGPU/fp_to_sint.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fp_to_sint.f64.ll +++ test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: @fp_to_sint_f64_i32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { %result = fptosi double %in to i32 store i32 %result, i32 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ ; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { +define 
amdgpu_kernel void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { %result = fptosi <2 x double> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -26,7 +26,7 @@ ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { %result = fptosi <4 x double> %in to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out ret void @@ -47,7 +47,7 @@ ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %val = load double, double addrspace(1)* %gep, align 8 @@ -58,7 +58,7 @@ ; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}} -define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %conv = fptosi double %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ ; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}| -define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %in.fabs = call double @llvm.fabs.f64(double %in) %conv = fptosi double %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out Index: test/CodeGen/AMDGPU/fp_to_sint.ll =================================================================== --- test/CodeGen/AMDGPU/fp_to_sint.ll +++ test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,7 +8,7 @@ ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ; SI: v_cvt_i32_f32_e32 ; SI: s_endpgm -define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { %conv = fptosi float %in to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i32 store i32 %conv, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptosi <2 x float> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -43,7 +43,7 @@ ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> 
addrspace(1)* %in) { %value = load <4 x float>, <4 x float> addrspace(1) * %in %result = fptosi <4 x float> %value to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -76,7 +76,7 @@ ; Check that the compiler doesn't crash with a "cannot select" error ; SI: s_endpgm -define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { entry: %0 = fptosi float %in to i64 store i64 %0, i64 addrspace(1)* %out @@ -128,7 +128,7 @@ ; EG-DAG: CNDE_INT ; SI: s_endpgm -define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { %conv = fptosi <2 x float> %x to <2 x i64> store <2 x i64> %conv, <2 x i64> addrspace(1)* %out ret void @@ -221,7 +221,7 @@ ; EG-DAG: CNDE_INT ; SI: s_endpgm -define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { %conv = fptosi <4 x float> %x to <4 x i64> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void @@ -233,7 +233,7 @@ ; EG: AND_INT ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y, ; EG-NEXT: -1082130432(-1.000000e+00) -define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %conv = fptosi float %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -241,7 +241,7 @@ ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: ; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}| -define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out @@ -251,7 +251,7 @@ ; FUNC-LABEL: {{^}}fp_to_sint_f32_i16: ; GCN: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_short [[VAL]] -define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 { %sint = fptosi float %in to i16 store i16 %sint, i16 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fp_to_uint.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fp_to_uint.f64.ll +++ test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -6,7 +6,7 @@ ; SI-LABEL: {{^}}fp_to_uint_i32_f64: ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { %cast = fptoui double %in to i32 store i32 %cast, i32 addrspace(1)* %out, align 4 ret void @@ -15,7 +15,7 @@ ; SI-LABEL: @fp_to_uint_v2i32_v2f64 ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { %cast = fptoui <2 x double> %in to <2 x i32> store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -26,7 +26,7 @@ ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x 
double> %in) { %cast = fptoui <4 x double> %in to <4 x i32> store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 ret void @@ -47,7 +47,7 @@ ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %val = load double, double addrspace(1)* %gep, align 8 @@ -57,14 +57,14 @@ } ; SI-LABEL: @fp_to_uint_v2i64_v2f64 -define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { %cast = fptoui <2 x double> %in to <2 x i64> store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 ret void } ; SI-LABEL: @fp_to_uint_v4i64_v4f64 -define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { %cast = fptoui <4 x double> %in to <4 x i64> store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 ret void @@ -72,7 +72,7 @@ ; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}} -define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %conv = fptoui double %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -80,7 +80,7 @@ ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}| -define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %in.fabs = call double @llvm.fabs.f64(double %in) %conv = fptoui double %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out Index: test/CodeGen/AMDGPU/fp_to_uint.ll =================================================================== --- test/CodeGen/AMDGPU/fp_to_uint.ll +++ test/CodeGen/AMDGPU/fp_to_uint.ll @@ -9,7 +9,7 @@ ; GCN: v_cvt_u32_f32_e32 ; GCN: s_endpgm -define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { %conv = fptoui float %in to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -21,7 +21,7 @@ ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 -define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptoui <2 x float> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -37,7 +37,7 @@ ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 -define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %value = load <4 x float>, <4 x float> addrspace(1) * %in %result = fptoui <4 x float> %value to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -68,7 +68,7 @@ ; EG-DAG: CNDE_INT ; GCN: s_endpgm 
-define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { +define amdgpu_kernel void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { %conv = fptoui float %x to i64 store i64 %conv, i64 addrspace(1)* %out ret void @@ -119,7 +119,7 @@ ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { %conv = fptoui <2 x float> %x to <2 x i64> store <2 x i64> %conv, <2 x i64> addrspace(1)* %out ret void @@ -212,7 +212,7 @@ ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { %conv = fptoui <4 x float> %x to <4 x i64> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void @@ -224,7 +224,7 @@ ; EG: AND_INT ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0, -define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %conv = fptoui float %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -232,7 +232,7 @@ ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: ; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}| -define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptoui float %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out @@ -246,7 +246,7 @@ ; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_short [[VAL]] -define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 { %uint = fptoui float %in to i16 store i16 %uint, i16 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fpext.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fpext.f16.ll +++ test/CodeGen/AMDGPU/fpext.f16.ll @@ -7,7 +7,7 @@ ; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_dword v[[R_F32]] ; GCN: s_endpgm -define void @fpext_f16_to_f32( +define amdgpu_kernel void @fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -23,7 +23,7 @@ ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}} ; GCN: s_endpgm -define void @fpext_f16_to_f64( +define amdgpu_kernel void @fpext_f16_to_f64( double addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -41,7 +41,7 @@ ; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} ; GCN: s_endpgm -define void @fpext_v2f16_to_v2f32( +define amdgpu_kernel void @fpext_v2f16_to_v2f32( <2 x float> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { entry: @@ -61,7 +61,7 @@ ; GCN: v_cvt_f64_f32_e32 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @fpext_v2f16_to_v2f64( +define amdgpu_kernel void @fpext_v2f16_to_v2f64( <2 x double> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32: ; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} -define 
void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) { +define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) { entry: %a.trunc = trunc i32 %a to i16 %a.val = bitcast i16 %a.trunc to half @@ -85,7 +85,7 @@ ; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]] -define void @fneg_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -99,7 +99,7 @@ ; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]| -define void @fabs_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -113,7 +113,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]| -define void @fneg_fabs_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -135,7 +135,7 @@ ; GCN: store_dword [[CVT]] ; GCN: store_short [[XOR]] -define void @fneg_multi_use_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -159,7 +159,7 @@ ; GCN: buffer_store_dword [[CVTA_NEG]] ; GCN: buffer_store_short [[MUL]] -define void @fneg_multi_foldable_use_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -181,7 +181,7 @@ ; GCN: store_dword [[CVT]] ; GCN: store_short [[XOR]] -define void @fabs_multi_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -205,7 +205,7 @@ ; GCN: buffer_store_dword [[ABS_A]] ; GCN: buffer_store_short [[MUL]] -define void @fabs_multi_foldable_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -227,7 +227,7 @@ ; GCN: buffer_store_dword [[CVT]] ; GCN: buffer_store_short [[OR]] -define void @fabs_fneg_multi_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -252,7 +252,7 @@ ; GCN: buffer_store_dword [[FABS_FNEG]] ; GCN: buffer_store_short [[MUL]] -define void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/fpext.ll =================================================================== --- test/CodeGen/AMDGPU/fpext.ll +++ test/CodeGen/AMDGPU/fpext.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fpext_f32_to_f64: ; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { %result = fpext float %in to double store double %result, double addrspace(1)* %out ret void @@ -12,7 +12,7 @@ ; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { %result = fpext <2 x 
float> %in to <2 x double> store <2 x double> %result, <2 x double> addrspace(1)* %out ret void @@ -22,7 +22,7 @@ ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) { +define amdgpu_kernel void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) { %result = fpext <3 x float> %in to <3 x double> store <3 x double> %result, <3 x double> addrspace(1)* %out ret void @@ -33,7 +33,7 @@ ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { %result = fpext <4 x float> %in to <4 x double> store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -48,7 +48,7 @@ ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { +define amdgpu_kernel void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { %result = fpext <8 x float> %in to <8 x double> store <8 x double> %result, <8 x double> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fptosi.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptosi.f16.ll +++ test/CodeGen/AMDGPU/fptosi.f16.ll @@ -7,7 +7,7 @@ ; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_I16]] ; GCN: s_endpgm -define void @fptosi_f16_to_i16( +define amdgpu_kernel void @fptosi_f16_to_i16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -23,7 +23,7 @@ ; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fptosi_f16_to_i32( +define amdgpu_kernel void @fptosi_f16_to_i32( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -40,7 +40,7 @@ ; GCN: buffer_load_ushort ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptosi_f16_to_i64( +define amdgpu_kernel void @fptosi_f16_to_i64( i64 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -62,7 +62,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i16( +define amdgpu_kernel void @fptosi_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -80,7 +80,7 @@ ; GCN: v_cvt_i32_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i32( +define amdgpu_kernel void @fptosi_v2f16_to_v2i32( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -98,7 +98,7 @@ ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i64( +define amdgpu_kernel void @fptosi_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/fptoui.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptoui.f16.ll +++ test/CodeGen/AMDGPU/fptoui.f16.ll @@ -8,7 +8,7 @@ ; VI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_I16]] ; GCN: s_endpgm -define void @fptoui_f16_to_i16( +define amdgpu_kernel void @fptoui_f16_to_i16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ ; GCN: v_cvt_u32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm 
-define void @fptoui_f16_to_i32( +define amdgpu_kernel void @fptoui_f16_to_i32( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -41,7 +41,7 @@ ; GCN: buffer_load_ushort ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptoui_f16_to_i64( +define amdgpu_kernel void @fptoui_f16_to_i64( i64 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -64,7 +64,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i16( +define amdgpu_kernel void @fptoui_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -82,7 +82,7 @@ ; GCN: v_cvt_u32_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i32( +define amdgpu_kernel void @fptoui_v2f16_to_v2i32( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -100,7 +100,7 @@ ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i64( +define amdgpu_kernel void @fptoui_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/fptrunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptrunc.f16.ll +++ test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -8,7 +8,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fptrunc_f32_to_f16( +define amdgpu_kernel void @fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fptrunc_f64_to_f16( +define amdgpu_kernel void @fptrunc_f64_to_f16( half addrspace(1)* %r, double addrspace(1)* %a) { entry: @@ -48,7 +48,7 @@ ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fptrunc_v2f32_to_v2f16( +define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x float> addrspace(1)* %a) { entry: @@ -74,7 +74,7 @@ ; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] -define void @fptrunc_v2f64_to_v2f16( +define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( <2 x half> addrspace(1)* %r, <2 x double> addrspace(1)* %a) { entry: @@ -89,7 +89,7 @@ ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fneg_fptrunc_f32_to_f16( +define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: @@ -105,7 +105,7 @@ ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fabs_fptrunc_f32_to_f16( +define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: @@ -121,7 +121,7 @@ ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]| ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fneg_fabs_fptrunc_f32_to_f16( +define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/fptrunc.ll =================================================================== --- test/CodeGen/AMDGPU/fptrunc.ll +++ test/CodeGen/AMDGPU/fptrunc.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: ; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @fptrunc_f64_to_f32(float 
addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { %result = fptrunc double %in to float store float %result, float addrspace(1)* %out ret void @@ -14,7 +14,7 @@ ; GCN-NOT: v_cvt ; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]] ; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]] -define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { %result = fptrunc double %in to half %result_i16 = bitcast half %result to i16 store i16 %result_i16, i16 addrspace(1)* %out @@ -24,7 +24,7 @@ ; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { %result = fptrunc <2 x double> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -35,7 +35,7 @@ ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { %result = fptrunc <4 x double> %in to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { +define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { %result = fptrunc <8 x double> %in to <8 x float> store <8 x float> %result, <8 x float> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fract.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fract.f64.ll +++ test/CodeGen/AMDGPU/fract.f64.ll @@ -27,7 +27,7 @@ ; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]] ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %floor.x = call double @llvm.floor.f64(double %x) %fract = fsub double %x, %floor.x @@ -54,7 +54,7 @@ ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]] ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %neg.x = fsub double -0.0, %x %floor.neg.x = call double @llvm.floor.f64(double %neg.x) @@ -82,7 +82,7 @@ ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]| ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %abs.x = call double @llvm.fabs.f64(double %x) %neg.abs.x = fsub double -0.0, %abs.x @@ -98,7 +98,7 @@ ; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]] ; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]] ; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]] -define void 
@multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %floor.x = call double @llvm.floor.f64(double %x) %fract = fsub double %x, %floor.x Index: test/CodeGen/AMDGPU/fract.ll =================================================================== --- test/CodeGen/AMDGPU/fract.ll +++ test/CodeGen/AMDGPU/fract.ll @@ -14,7 +14,7 @@ ; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %floor.x = call float @llvm.floor.f32(float %x) %fract = fsub float %x, %floor.x @@ -29,7 +29,7 @@ ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %x.neg = fsub float -0.0, %x %floor.x.neg = call float @llvm.floor.f32(float %x.neg) @@ -45,7 +45,7 @@ ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %abs.x = call float @llvm.fabs.f32(float %x) %neg.abs.x = fsub float -0.0, %abs.x @@ -61,7 +61,7 @@ ; GCN-UNSAFE: buffer_store_dword [[FLOOR]] ; GCN-UNSAFE: buffer_store_dword [[FRACT]] -define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %floor.x = call float @llvm.floor.f32(float %x) %fract = fsub float %x, %floor.x Index: test/CodeGen/AMDGPU/frem.ll =================================================================== --- test/CodeGen/AMDGPU/frem.ll +++ test/CodeGen/AMDGPU/frem.ll @@ -15,7 +15,7 @@ ; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -33,7 +33,7 @@ ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] ; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] ; GCN: buffer_store_dword [[RESULT]] -define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #1 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -54,7 +54,7 @@ ; GCN: v_add_f64 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @frem_f64(double 
addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -70,7 +70,7 @@ ; CI: v_trunc_f64_e32 ; GCN: v_fma_f64 ; GCN: s_endpgm -define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #1 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -79,7 +79,7 @@ ret void } -define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, <2 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 @@ -89,7 +89,7 @@ ret void } -define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, <4 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 @@ -99,7 +99,7 @@ ret void } -define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 Index: test/CodeGen/AMDGPU/fsqrt.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fsqrt.f64.ll +++ test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 { %r0 = load double, double addrspace(1)* %in %r1 = call double @llvm.sqrt.f64(double %r0) store double %r1, double addrspace(1)* %out @@ -12,7 +12,7 @@ ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 { +define amdgpu_kernel void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 { %r0 = load double, double addrspace(1)* %in %r1 = call double @llvm.sqrt.f64(double %r0) store double %r1, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/fsqrt.ll =================================================================== --- test/CodeGen/AMDGPU/fsqrt.ll +++ test/CodeGen/AMDGPU/fsqrt.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}v_safe_fsqrt_f32: ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} -define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { %r0 = load float, float addrspace(1)* %in %r1 = call float @llvm.sqrt.f32(float %r0) store float %r1, float addrspace(1)* %out @@ -16,7 +16,7 @@ ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32: ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} -define void 
@v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 { +define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 { %r0 = load float, float addrspace(1)* %in %r1 = call float @llvm.sqrt.f32(float %r0) store float %r1, float addrspace(1)* %out @@ -29,7 +29,7 @@ ; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z ; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS -define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 { entry: %fdiv = call float @llvm.sqrt.f32(float %in) store float %fdiv, float addrspace(1)* %out @@ -44,7 +44,7 @@ ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS -define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -65,7 +65,7 @@ ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS -define void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { entry: %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) store <4 x float> %fdiv, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ ; FUNC-LABEL: {{^}}elim_redun_check_neg0: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, -0.000000e+00 @@ -87,7 +87,7 @@ ; FUNC-LABEL: {{^}}elim_redun_check_pos0: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, 0.000000e+00 @@ -99,7 +99,7 @@ ; FUNC-LABEL: {{^}}elim_redun_check_ult: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp ult float %in, -0.000000e+00 @@ -112,7 +112,7 @@ ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp olt <2 x float> %in, @@ -125,7 +125,7 @@ ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp ult <2 x float> %in, Index: test/CodeGen/AMDGPU/fsub.f16.ll 
=================================================================== --- test/CodeGen/AMDGPU/fsub.f16.ll +++ test/CodeGen/AMDGPU/fsub.f16.ll @@ -12,7 +12,7 @@ ; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16( +define amdgpu_kernel void @fsub_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -32,7 +32,7 @@ ; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16_imm_a( +define amdgpu_kernel void @fsub_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -50,7 +50,7 @@ ; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16_imm_b( +define amdgpu_kernel void @fsub_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -89,7 +89,7 @@ ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16( +define amdgpu_kernel void @fsub_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -123,7 +123,7 @@ ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16_imm_a( +define amdgpu_kernel void @fsub_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -154,7 +154,7 @@ ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16_imm_b( +define amdgpu_kernel void @fsub_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/fsub.ll =================================================================== --- test/CodeGen/AMDGPU/fsub.ll +++ test/CodeGen/AMDGPU/fsub.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}v_fsub_f32: ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -17,7 +17,7 @@ ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { %sub = fsub float %a, %b store float %sub, float addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { %sub = fsub <2 x float> %a, %b store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 ret void @@ -45,7 +45,7 @@ ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1)* 
%in, align 16 %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 @@ -60,7 +60,7 @@ ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: s_endpgm -define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { +define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { %result = fsub <4 x float> %a, %b store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 ret void @@ -69,7 +69,7 @@ ; FUNC-LABEL: {{^}}v_fneg_fsub_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] -define void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -82,7 +82,7 @@ ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor -define void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -95,7 +95,7 @@ ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor -define void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -111,7 +111,7 @@ ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] -define void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -123,7 +123,7 @@ ; FUNC-LABEL: {{^}}v_fsub_0_nsz_attribute_f32: ; SI-NOT: v_sub -define void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %a = load float, float addrspace(1)* %in, align 4 %result = fsub float %a, 0.0 store float %result, float addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/fsub64.ll =================================================================== --- test/CodeGen/AMDGPU/fsub64.ll +++ test/CodeGen/AMDGPU/fsub64.ll @@ -5,7 +5,7 @@ ; SI-LABEL: {{^}}fsub_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 
double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -16,7 +16,7 @@ ; SI-LABEL: {{^}}fsub_fabs_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} -define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -28,7 +28,7 @@ ; SI-LABEL: {{^}}fsub_fabs_inv_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} -define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -40,7 +40,7 @@ ; SI-LABEL: {{^}}s_fsub_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double %a, %b store double %sub, double addrspace(1)* %out ret void @@ -48,7 +48,7 @@ ; SI-LABEL: {{^}}s_fsub_imm_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0 -define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double 4.0, %a store double %sub, double addrspace(1)* %out ret void @@ -56,7 +56,7 @@ ; SI-LABEL: {{^}}s_fsub_imm_inv_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0 -define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double %a, 4.0 store double %sub, double addrspace(1)* %out ret void @@ -64,7 +64,7 @@ ; SI-LABEL: {{^}}s_fsub_self_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { +define amdgpu_kernel void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { %sub = fsub double %a, %a store double %sub, double addrspace(1)* %out ret void @@ -73,7 +73,7 @@ ; SI-LABEL: {{^}}fsub_v2f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { +define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { %sub = fsub <2 x double> %a, %b store <2 x double> %sub, <2 x double> addrspace(1)* %out ret void @@ -84,7 +84,7 @@ ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { +define amdgpu_kernel void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 %a = load <4 x double>, <4 x 
double> addrspace(1)* %in %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr @@ -98,7 +98,7 @@ ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { +define amdgpu_kernel void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { %result = fsub <4 x double> %a, %b store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 ret void Index: test/CodeGen/AMDGPU/ftrunc.f64.ll =================================================================== --- test/CodeGen/AMDGPU/ftrunc.f64.ll +++ test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -13,7 +13,7 @@ ; CI: v_trunc_f64 ; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 ; SI: s_endpgm -define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %x = load double, double addrspace(1)* %in, align 8 %y = call double @llvm.trunc.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out, align 8 @@ -36,7 +36,7 @@ ; SI-DAG: cndmask_b32 ; SI-DAG: cndmask_b32 ; SI: s_endpgm -define void @ftrunc_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.trunc.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -45,7 +45,7 @@ ; FUNC-LABEL: {{^}}ftrunc_v2f64: ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -55,7 +55,7 @@ ; FIXME-CI: v_trunc_f64_e32 ; FIXME-CI: v_trunc_f64_e32 ; FIXME-CI: v_trunc_f64_e32 -; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; define amdgpu_kernel void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone ; store <3 x double> %y, <3 x double> addrspace(1)* %out ; ret void @@ -66,7 +66,7 @@ ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, <4 x double> addrspace(1)* %out ret void @@ -81,7 +81,7 @@ ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -104,7 +104,7 @@ ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> 
@llvm.trunc.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/ftrunc.ll =================================================================== --- test/CodeGen/AMDGPU/ftrunc.ll +++ test/CodeGen/AMDGPU/ftrunc.ll @@ -12,7 +12,7 @@ ; FUNC-LABEL: {{^}}ftrunc_f32: ; EG: TRUNC ; SI: v_trunc_f32_e32 -define void @ftrunc_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) { %y = call float @llvm.trunc.f32(float %x) nounwind readnone store float %y, float addrspace(1)* %out ret void @@ -23,7 +23,7 @@ ; EG: TRUNC ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone store <2 x float> %y, <2 x float> addrspace(1)* %out ret void @@ -36,7 +36,7 @@ ; FIXME-SI: v_trunc_f32_e32 ; FIXME-SI: v_trunc_f32_e32 ; FIXME-SI: v_trunc_f32_e32 -; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +; define amdgpu_kernel void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { ; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone ; store <3 x float> %y, <3 x float> addrspace(1)* %out ; ret void @@ -51,7 +51,7 @@ ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone store <4 x float> %y, <4 x float> addrspace(1)* %out ret void @@ -74,7 +74,7 @@ ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { +define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone store <8 x float> %y, <8 x float> addrspace(1)* %out ret void @@ -113,7 +113,7 @@ ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { +define amdgpu_kernel void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone store <16 x float> %y, <16 x float> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/gep-address-space.ll =================================================================== --- test/CodeGen/AMDGPU/gep-address-space.ll +++ test/CodeGen/AMDGPU/gep-address-space.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s -define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { +define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { ; CHECK-LABEL: {{^}}use_gep_address_space: ; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} ; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 @@ -17,7 +17,7 @@ ; SI: s_or_b32 ; CI: s_add_i32 ; CHECK: ds_write_b32 -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { 
+define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 store i32 99, i32 addrspace(3)* %p ret void @@ -39,7 +39,7 @@ ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_endpgm -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { +define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 @@ -60,7 +60,7 @@ ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_endpgm -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { +define amdgpu_kernel void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 Index: test/CodeGen/AMDGPU/global-constant.ll =================================================================== --- test/CodeGen/AMDGPU/global-constant.ll +++ test/CodeGen/AMDGPU/global-constant.ll @@ -26,7 +26,7 @@ ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4 -define void @private_test(i32 %index, float addrspace(1)* %out) { +define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) { %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index %val = load float, float addrspace(2)* %ptr store float %val, float addrspace(1)* %out @@ -40,7 +40,7 @@ ; HSA: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4 -define void @available_externally_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1 %val = load i32, i32 addrspace(2)* %ptr store i32 %val, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/global-directive.ll =================================================================== --- test/CodeGen/AMDGPU/global-directive.ll +++ test/CodeGen/AMDGPU/global-directive.ll @@ -5,7 +5,7 @@ ; SI: .globl foo ; SI: {{^}}foo: -define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr Index: test/CodeGen/AMDGPU/global-extload-i16.ll =================================================================== --- test/CodeGen/AMDGPU/global-extload-i16.ll +++ test/CodeGen/AMDGPU/global-extload-i16.ll @@ -7,7 +7,7 @@ ; SI: buffer_load_ushort ; SI: buffer_store_dword ; SI: s_endpgm -define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void 
@zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ ; SI: buffer_load_sshort ; SI: buffer_store_dword ; SI: s_endpgm -define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: ; SI: buffer_load_ushort ; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -38,7 +38,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: ; SI: buffer_load_sshort ; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -47,7 +47,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: ; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -56,7 +56,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: ; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -65,7 +65,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: ; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -74,7 +74,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: ; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> 
%ext, <4 x i32> addrspace(1)* %out @@ -83,7 +83,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: ; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -92,7 +92,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: ; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -101,7 +101,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: ; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -110,7 +110,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: ; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -119,7 +119,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: ; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -128,7 +128,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: ; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: ; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> 
addrspace(1)* %out @@ -146,7 +146,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: ; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -157,7 +157,7 @@ ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]], ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -168,7 +168,7 @@ ; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 ; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] ; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 -define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -177,7 +177,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: ; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -186,7 +186,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: ; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -195,7 +195,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: ; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -204,7 +204,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: ; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -213,7 +213,7 @@ ; FUNC-LABEL: 
{{^}}zextload_global_v4i16_to_v4i64: ; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -222,7 +222,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: ; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -231,7 +231,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: ; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -240,7 +240,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: ; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: ; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -258,7 +258,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: ; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -267,7 +267,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: ; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -276,7 +276,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: ; SI: 
s_endpgm -define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -285,7 +285,7 @@ ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: ; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -294,7 +294,7 @@ ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: ; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/global-variable-relocs.ll =================================================================== --- test/CodeGen/AMDGPU/global-variable-relocs.ll +++ test/CodeGen/AMDGPU/global-variable-relocs.ll @@ -19,7 +19,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @private_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -33,7 +33,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @internal_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -50,7 +50,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @available_externally_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -67,7 +67,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @linkonce_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* 
@linkonce, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -84,7 +84,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @weak_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -101,7 +101,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @common_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -118,7 +118,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @extern_weak_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -135,7 +135,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @linkonce_odr_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -152,7 +152,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @weak_odr_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -169,7 +169,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @external_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -186,7 +186,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @external_w_init_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out Index: 
test/CodeGen/AMDGPU/global_atomics.ll =================================================================== --- test/CodeGen/AMDGPU/global_atomics.ll +++ test/CodeGen/AMDGPU/global_atomics.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_offset: ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_soffset: ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -25,7 +25,7 @@ ; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd ; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add -define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -36,7 +36,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -47,7 +47,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -59,7 +59,7 @@ ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -70,7 +70,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32: ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val 
= atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -79,7 +79,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_ret: ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -89,7 +89,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_addr64: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -100,7 +100,7 @@ ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -110,7 +110,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_offset: ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -120,7 +120,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -131,7 +131,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -143,7 +143,7 @@ ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void 
@atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -154,7 +154,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32: ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -163,7 +163,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_ret: ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -173,7 +173,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_addr64: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -184,7 +184,7 @@ ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -194,7 +194,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_offset: ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -204,7 +204,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -215,7 +215,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: 
flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -227,7 +227,7 @@ ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -238,7 +238,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32: ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -247,7 +247,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_ret: ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -257,7 +257,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -268,7 +268,7 @@ ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -278,7 +278,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_offset: ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -288,7 +288,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: ; GCN: buffer_atomic_smax 
[[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -299,7 +299,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -311,7 +311,7 @@ ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -322,7 +322,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32: ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -331,7 +331,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_ret: ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -341,7 +341,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_addr64: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -352,7 +352,7 @@ ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr 
i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -362,7 +362,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_offset: ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -372,7 +372,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -383,7 +383,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -395,7 +395,7 @@ ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -406,7 +406,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32: ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -415,7 +415,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_ret: ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -425,7 +425,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 
%index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -436,7 +436,7 @@ ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -446,7 +446,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32_offset: ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -456,7 +456,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -467,7 +467,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -479,7 +479,7 @@ ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -490,7 +490,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32: ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -499,7 +499,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32_ret: ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 
addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -509,7 +509,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32_addr64: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -520,7 +520,7 @@ ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -530,7 +530,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_offset: ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -540,7 +540,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: ; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -551,7 +551,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -563,7 +563,7 @@ ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index 
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -574,7 +574,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32: ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -583,7 +583,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_ret: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -593,7 +593,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -604,7 +604,7 @@ ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -614,7 +614,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_offset: ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -624,7 +624,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -635,7 +635,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index 
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -647,7 +647,7 @@ ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -658,7 +658,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32: ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -667,7 +667,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_ret: ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -677,7 +677,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_addr64: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -688,7 +688,7 @@ ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -698,7 +698,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -708,7 +708,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = 
getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -720,7 +720,7 @@ ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -733,7 +733,7 @@ ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -744,7 +744,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32: ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -753,7 +753,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -763,7 +763,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -774,7 +774,7 @@ ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -784,7 +784,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -794,7 +794,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset: ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -807,7 +807,7 @@ ; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -819,7 +819,7 @@ ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -831,7 +831,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32: ; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -840,7 +840,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret: ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst %extract0 = extractvalue { i32, i1 } %val, 0 @@ -851,7 +851,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 
addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -862,7 +862,7 @@ ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -873,7 +873,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_offset: ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -883,7 +883,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -894,7 +894,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -906,7 +906,7 @@ ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -917,7 +917,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32: ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -926,7 +926,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_ret: ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define 
amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -936,7 +936,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -947,7 +947,7 @@ ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -959,7 +959,7 @@ ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4 %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4 @@ -971,7 +971,7 @@ ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 store i32 %val, i32 addrspace(1)* %out @@ -982,7 +982,7 @@ ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -995,7 +995,7 @@ ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, 
align 4 @@ -1006,7 +1006,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32_offset: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4 @@ -1016,7 +1016,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 ret void @@ -1025,7 +1025,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -1036,7 +1036,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32_addr64: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4 Index: test/CodeGen/AMDGPU/global_atomics_i64.ll =================================================================== --- test/CodeGen/AMDGPU/global_atomics_i64.ll +++ test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_offset: ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset: ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst @@ -24,7 +24,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], 
v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -36,7 +36,7 @@ ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -47,7 +47,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64: ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -56,7 +56,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_ret: ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -66,7 +66,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -77,7 +77,7 @@ ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -87,7 +87,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, 
i64 %in seq_cst @@ -97,7 +97,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset: ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst @@ -108,7 +108,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -120,7 +120,7 @@ ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -131,7 +131,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64: ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -140,7 +140,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_ret: ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -150,7 +150,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -161,7 +161,7 @@ ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: 
buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -171,7 +171,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst @@ -181,7 +181,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset: ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst @@ -192,7 +192,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -204,7 +204,7 @@ ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -215,7 +215,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64: ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -224,7 +224,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_ret: ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ 
-234,7 +234,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -245,7 +245,7 @@ ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -255,7 +255,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst @@ -265,7 +265,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset: ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst @@ -276,7 +276,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -288,7 +288,7 @@ ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -299,7 +299,7 @@ ; GCN-LABEL: 
{{^}}atomic_max_i64: ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -308,7 +308,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_ret: ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -318,7 +318,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -329,7 +329,7 @@ ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -339,7 +339,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst @@ -349,7 +349,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset: ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst @@ -360,7 +360,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, 
i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -372,7 +372,7 @@ ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -383,7 +383,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64: ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -392,7 +392,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_ret: ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -402,7 +402,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -413,7 +413,7 @@ ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -423,7 +423,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst @@ -433,7 +433,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset: ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 
0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst @@ -444,7 +444,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -456,7 +456,7 @@ ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -467,7 +467,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64: ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -476,7 +476,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_ret: ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -486,7 +486,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -497,7 +497,7 @@ ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void 
@atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -507,7 +507,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst @@ -517,7 +517,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset: ; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst @@ -528,7 +528,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -540,7 +540,7 @@ ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -551,7 +551,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64: ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -560,7 +560,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_ret: ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -570,7 +570,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -581,7 +581,7 @@ ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -591,7 +591,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst @@ -601,7 +601,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset: ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst @@ -612,7 +612,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -624,7 +624,7 @@ ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -635,7 +635,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64: ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) 
{ +define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -644,7 +644,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_ret: ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -654,7 +654,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -665,7 +665,7 @@ ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -675,7 +675,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst @@ -685,7 +685,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset: ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst @@ -696,7 +696,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -708,7 +708,7 @@ ; CI: buffer_atomic_swap_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -719,7 +719,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64: ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -728,7 +728,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_ret: ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -738,7 +738,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -749,7 +749,7 @@ ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -759,7 +759,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst @@ -769,7 +769,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset: ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void 
@atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst @@ -780,7 +780,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -792,7 +792,7 @@ ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -803,7 +803,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64: ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -812,7 +812,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_ret: ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -822,7 +822,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -833,7 +833,7 @@ ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 
addrspace(1)* %ptr, i64 %in seq_cst @@ -851,7 +851,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -861,7 +861,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_soffset: ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -define void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -871,7 +871,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -884,7 +884,7 @@ ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -896,7 +896,7 @@ ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -908,7 +908,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64: ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst ret void @@ -917,7 +917,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret: ; GCN: buffer_atomic_cmpswap_x2 
v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -928,7 +928,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -939,7 +939,7 @@ ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -952,7 +952,7 @@ ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4 %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8 @@ -964,7 +964,7 @@ ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8 store i64 %val, i64 addrspace(1)* %out @@ -975,7 +975,7 @@ ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -988,7 +988,7 @@ ; CI: 
buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8 @@ -999,7 +999,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i64_offset: ; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8 @@ -1009,7 +1009,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc -define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { entry: store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8 ret void @@ -1018,7 +1018,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i64_addr64_offset: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -1029,7 +1029,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i64_addr64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8 Index: test/CodeGen/AMDGPU/gv-const-addrspace.ll =================================================================== --- test/CodeGen/AMDGPU/gv-const-addrspace.ll +++ test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -15,7 +15,7 @@ ; EG: @float_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @float(float addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 @@ -33,7 +33,7 @@ ; EG: @i32_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @i32(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) { entry: %0 = getelementptr inbounds [5 x i32], [5 x i32] 
addrspace(2)* @i32_gv, i32 0, i32 %index %1 = load i32, i32 addrspace(2)* %0 @@ -53,7 +53,7 @@ ; EG: @struct_foo_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index %load = load i32, i32 addrspace(2)* %gep, align 4 store i32 %load, i32 addrspace(1)* %out, align 4 @@ -72,7 +72,7 @@ ; EG: @array_v1_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 @@ -84,7 +84,7 @@ ; EG: VTX_READ_32 ; EG: @float_gv ; EG-NOT: MOVA_INT -define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +define amdgpu_kernel void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { entry: %0 = icmp eq i32 0, %a br i1 %0, label %if, label %else Index: test/CodeGen/AMDGPU/gv-offset-folding.ll =================================================================== --- test/CodeGen/AMDGPU/gv-offset-folding.ll +++ test/CodeGen/AMDGPU/gv-offset-folding.ll @@ -13,7 +13,7 @@ ; CHECK-LABEL: lds_no_offset: ; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4 -define void @lds_no_offset() { +define amdgpu_kernel void @lds_no_offset() { entry: %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1 store i32 0, i32 addrspace(3)* %ptr Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -8,7 +8,7 @@ ; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] ; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]] ; GCN: buffer_store_short [[CVT]] -define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { store half %arg, half addrspace(1)* %out ret void } @@ -20,7 +20,7 @@ ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]] ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm -define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out ret void } @@ -34,7 +34,7 @@ ; GCN-DAG: buffer_store_short ; GCN-NOT: buffer_store ; GCN: s_endpgm -define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { store <3 x half> %arg, <3 x half> addrspace(1)* %out ret void } @@ -46,33 +46,33 @@ ; GCN: buffer_load_ushort ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}load_v8f16_arg: -define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void 
@load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { store <8 x half> %arg, <8 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v2f16_arg: -define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { +define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { %fpext = fpext <2 x half> %in to <2 x float> store <2 x float> %fpext, <2 x float> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_f16_to_f32_arg: -define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to float store float %ext, float addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: -define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x float> store <2 x float> %ext, <2 x float> addrspace(1)* %out ret void @@ -90,14 +90,14 @@ ; GCN-DAG: buffer_store_dword ; GCN-DAG: buffer_store_dwordx2 ; GCN: s_endpgm -define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x float> store <3 x float> %ext, <3 x float> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: -define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x float> store <4 x float> %ext, <4 x float> addrspace(1)* %out ret void @@ -124,7 +124,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, <8 x float> addrspace(1)* %out ret void @@ -138,7 +138,7 @@ ; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]] ; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to double store double %ext, double addrspace(1)* %out ret void @@ -152,7 +152,7 @@ ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x double> store <2 x double> %ext, <2 x double> addrspace(1)* %out ret void @@ -169,7 +169,7 @@ ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x double> store <3 x double> %ext, <3 x double> addrspace(1)* %out ret void @@ 
-189,7 +189,7 @@ ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, <4 x double> addrspace(1)* %out ret void @@ -227,7 +227,7 @@ ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x double> store <8 x double> %ext, <8 x double> addrspace(1)* %out ret void @@ -236,7 +236,7 @@ ; GCN-LABEL: {{^}}global_load_store_f16: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in store half %val, half addrspace(1)* %out ret void @@ -245,7 +245,7 @@ ; GCN-LABEL: {{^}}global_load_store_v2f16: ; GCN: buffer_load_dword [[TMP:v[0-9]+]] ; GCN: buffer_store_dword [[TMP]] -define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in store <2 x half> %val, <2 x half> addrspace(1)* %out ret void @@ -254,7 +254,7 @@ ; GCN-LABEL: {{^}}global_load_store_v4f16: ; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx2 [[TMP]] -define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in store <4 x half> %val, <4 x half> addrspace(1)* %out ret void @@ -264,7 +264,7 @@ ; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: s_endpgm -define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in store <8 x half> %val, <8 x half> addrspace(1)* %out ret void @@ -274,7 +274,7 @@ ; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] ; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] ; GCN: buffer_store_dword [[CVT]] -define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %cvt = fpext half %val to float store float %cvt, float addrspace(1)* %out @@ -289,7 +289,7 @@ ; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> 
addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out @@ -297,7 +297,7 @@ } ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: -define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out @@ -305,7 +305,7 @@ } ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: -define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in %cvt = fpext <4 x half> %val to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out @@ -313,7 +313,7 @@ } ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: -define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in %cvt = fpext <8 x half> %val to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out @@ -347,7 +347,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x float> store <16 x float> %cvt, <16 x float> addrspace(1)* %out @@ -359,7 +359,7 @@ ; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] ; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] ; GCN: buffer_store_dwordx2 [[CVT1]] -define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %cvt = fpext half %val to double store double %cvt, double addrspace(1)* %out @@ -375,7 +375,7 @@ ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} ; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x double> store <2 x double> %cvt, <2 x double> addrspace(1)* %out @@ -413,7 +413,7 @@ ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN: s_endpgm -define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in 
%cvt = fpext <3 x half> %val to <3 x double> store <3 x double> %cvt, <3 x double> addrspace(1)* %out @@ -421,7 +421,7 @@ } ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: -define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in %cvt = fpext <4 x half> %val to <4 x double> store <4 x double> %cvt, <4 x double> addrspace(1)* %out @@ -429,7 +429,7 @@ } ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: -define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in %cvt = fpext <8 x half> %val to <8 x double> store <8 x double> %cvt, <8 x double> addrspace(1)* %out @@ -437,7 +437,7 @@ } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: -define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x double> store <16 x double> %cvt, <16 x double> addrspace(1)* %out @@ -448,7 +448,7 @@ ; GCN: buffer_load_dword [[LOAD:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] ; GCN: buffer_store_short [[CVT]] -define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { %val = load float, float addrspace(1)* %in %cvt = fptrunc float %val to half store half %cvt, half addrspace(1)* %out @@ -463,7 +463,7 @@ ; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] ; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm -define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %val = load <2 x float>, <2 x float> addrspace(1)* %in %cvt = fptrunc <2 x float> %val to <2 x half> store <2 x half> %cvt, <2 x half> addrspace(1)* %out @@ -479,7 +479,7 @@ ; GCN: buffer_store_short ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %val = load <3 x float>, <3 x float> addrspace(1)* %in %cvt = fptrunc <3 x float> %val to <3 x half> store <3 x half> %cvt, <3 x half> addrspace(1)* %out @@ -494,7 +494,7 @@ ; GCN: v_cvt_f16_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %val = load <4 x float>, <4 x float> addrspace(1)* %in %cvt = fptrunc <4 x float> %val to <4 x half> store <4 x half> %cvt, <4 x half> addrspace(1)* %out @@ -514,7 +514,7 @@ ; GCN: v_cvt_f16_f32_e32 ; GCN: 
buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { %val = load <8 x float>, <8 x float> addrspace(1)* %in %cvt = fptrunc <8 x float> %val to <8 x half> store <8 x half> %cvt, <8 x half> addrspace(1)* %out @@ -545,7 +545,7 @@ ; GCN-DAG: buffer_store_dwordx4 ; GCN-DAG: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in %cvt = fptrunc <16 x float> %val to <16 x half> store <16 x half> %cvt, <16 x half> addrspace(1)* %out @@ -560,7 +560,7 @@ ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { +define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { %add = fadd half %a, %b store half %add, half addrspace(1)* %out, align 4 ret void @@ -570,7 +570,7 @@ ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { +define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { %add = fadd <2 x half> %a, %b store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 ret void @@ -582,7 +582,7 @@ ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 @@ -601,7 +601,7 @@ ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { +define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { %add = fadd <8 x half> %a, %b store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 ret void @@ -610,7 +610,7 @@ ; GCN-LABEL: {{^}}test_bitcast_from_half: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { %val = load half, half addrspace(1)* %in %val_int = bitcast half %val to i16 store i16 %val_int, i16 addrspace(1)* %out @@ -620,7 +620,7 @@ ; GCN-LABEL: {{^}}test_bitcast_to_half: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %val = load i16, i16 addrspace(1)* %in %val_fp = bitcast i16 %val to half store half %val_fp, half addrspace(1)* %out Index: test/CodeGen/AMDGPU/hsa-default-device.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-default-device.ll +++ 
test/CodeGen/AMDGPU/hsa-default-device.ll @@ -4,7 +4,7 @@ ; unsupported device. ; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" -define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { +define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { store float 0.0, float addrspace(1)* %out0 ret void } Index: test/CodeGen/AMDGPU/hsa-fp-mode.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-fp-mode.ll +++ test/CodeGen/AMDGPU/hsa-fp-mode.ll @@ -4,7 +4,7 @@ ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { +define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -14,7 +14,7 @@ ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { +define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -24,7 +24,7 @@ ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { +define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -34,7 +34,7 @@ ; GCN: float_mode = 48 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { +define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -44,7 +44,7 @@ ; GCN: float_mode = 240 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { +define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -54,7 +54,7 @@ ; GCN: float_mode = 0 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { +define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -64,7 +64,7 @@ ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 0 ; GCN: enable_ieee_mode = 1 -define void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 { +define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void Index: test/CodeGen/AMDGPU/hsa-globals.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-globals.ll +++ test/CodeGen/AMDGPU/hsa-globals.ll @@ -9,7 +9,7 @@ 
@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0 @external_readonly = unnamed_addr addrspace(2) constant i32 0 -define void @test() { +define amdgpu_kernel void @test() { ret void } Index: test/CodeGen/AMDGPU/hsa-group-segment.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-group-segment.ll +++ test/CodeGen/AMDGPU/hsa-group-segment.ll @@ -3,7 +3,7 @@ @internal_group = internal addrspace(3) global i32 undef @external_group = addrspace(3) global i32 undef -define void @test() { +define amdgpu_kernel void @test() { entry: store i32 0, i32 addrspace(3)* @internal_group store i32 0, i32 addrspace(3)* @external_group Index: test/CodeGen/AMDGPU/i1-copy-implicit-def.ll =================================================================== --- test/CodeGen/AMDGPU/i1-copy-implicit-def.ll +++ test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -5,7 +5,7 @@ ; SI-LABEL: {{^}}br_implicit_def: ; SI: BB#0: ; SI-NEXT: s_cbranch_scc1 -define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { +define amdgpu_kernel void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { bb: br i1 undef, label %bb1, label %bb2 Index: test/CodeGen/AMDGPU/i1-copy-phi.ll =================================================================== --- test/CodeGen/AMDGPU/i1-copy-phi.ll +++ test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -10,7 +10,7 @@ ; SI: s_and_saveexec_b64 ; SI: s_xor_b64 ; SI: s_endpgm -define void @br_i1_phi(i32 %arg) { +define amdgpu_kernel void @br_i1_phi(i32 %arg) { bb: %tidig = call i32 @llvm.r600.read.tidig.x() #0 %cmp = trunc i32 %tidig to i1 Index: test/CodeGen/AMDGPU/i8-to-double-to-float.ll =================================================================== --- test/CodeGen/AMDGPU/i8-to-double-to-float.ll +++ test/CodeGen/AMDGPU/i8-to-double-to-float.ll @@ -2,7 +2,7 @@ ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { %1 = load i8, i8 addrspace(1)* %in %2 = uitofp i8 %1 to double %3 = fptrunc double %2 to float Index: test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll =================================================================== --- test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll +++ test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll @@ -6,7 +6,7 @@ ;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;CHECK-NOT: SETNE_INT -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 Index: test/CodeGen/AMDGPU/icmp.i16.ll =================================================================== --- test/CodeGen/AMDGPU/icmp.i16.ll +++ test/CodeGen/AMDGPU/icmp.i16.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}i16_eq: ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -26,7 +26,7 @@ ; GCN-LABEL: {{^}}i16_ne: ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} 
-define void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -44,7 +44,7 @@ ; GCN-LABEL: {{^}}i16_ugt: ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -62,7 +62,7 @@ ; GCN-LABEL: {{^}}i16_uge: ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -80,7 +80,7 @@ ; GCN-LABEL: {{^}}i16_ult: ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -98,7 +98,7 @@ ; GCN-LABEL: {{^}}i16_ule: ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -117,7 +117,7 @@ ; GCN-LABEL: {{^}}i16_sgt: ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -135,7 +135,7 @@ ; GCN-LABEL: {{^}}i16_sge: ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -153,7 +153,7 @@ ; GCN-LABEL: {{^}}i16_slt: ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -171,7 +171,7 @@ ; GCN-LABEL: {{^}}i16_sle: ; VI: 
v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -190,7 +190,7 @@ ; GCN-LABEL: {{^}}i16_eq_v_s: ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -206,7 +206,7 @@ ; GCN-LABEL: {{^}}i16_ne_v_s: ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -222,7 +222,7 @@ ; GCN-LABEL: {{^}}i16_ugt_v_s: ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -238,7 +238,7 @@ ; GCN-LABEL: {{^}}i16_uge_v_s: ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -254,7 +254,7 @@ ; GCN-LABEL: {{^}}i16_ult_v_s: ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -270,7 +270,7 @@ ; GCN-LABEL: {{^}}i16_ule_v_s: ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -286,7 +286,7 @@ ; GCN-LABEL: {{^}}i16_sgt_v_s: ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -302,7 +302,7 @@ ; GCN-LABEL: {{^}}i16_sge_v_s: ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: 
v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -318,7 +318,7 @@ ; GCN-LABEL: {{^}}i16_slt_v_s: ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -334,7 +334,7 @@ ; GCN-LABEL: {{^}}i16_sle_v_s: ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/icmp64.ll =================================================================== --- test/CodeGen/AMDGPU/icmp64.ll +++ test/CodeGen/AMDGPU/icmp64.ll @@ -3,7 +3,7 @@ ; SI-LABEL: {{^}}test_i64_eq: ; SI: v_cmp_eq_u64 -define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp eq i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -12,7 +12,7 @@ ; SI-LABEL: {{^}}test_i64_ne: ; SI: v_cmp_ne_u64 -define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ne i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -21,7 +21,7 @@ ; SI-LABEL: {{^}}test_i64_slt: ; SI: v_cmp_lt_i64 -define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp slt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -30,7 +30,7 @@ ; SI-LABEL: {{^}}test_i64_ult: ; SI: v_cmp_lt_u64 -define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ult i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -39,7 +39,7 @@ ; SI-LABEL: {{^}}test_i64_sle: ; SI: v_cmp_le_i64 -define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sle i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -48,7 +48,7 @@ ; SI-LABEL: {{^}}test_i64_ule: ; SI: v_cmp_le_u64 -define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ule i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -57,7 +57,7 @@ ; SI-LABEL: {{^}}test_i64_sgt: ; SI: v_cmp_gt_i64 -define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, 
i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sgt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -66,7 +66,7 @@ ; SI-LABEL: {{^}}test_i64_ugt: ; SI: v_cmp_gt_u64 -define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -75,7 +75,7 @@ ; SI-LABEL: {{^}}test_i64_sge: ; SI: v_cmp_ge_i64 -define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sge i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -84,7 +84,7 @@ ; SI-LABEL: {{^}}test_i64_uge: ; SI: v_cmp_ge_u64 -define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp uge i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/image-attributes.ll =================================================================== --- test/CodeGen/AMDGPU/image-attributes.ll +++ test/CodeGen/AMDGPU/image-attributes.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}width_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].Z -define void @width_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @width_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( @@ -20,7 +20,7 @@ ; FUNC-LABEL: {{^}}width_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].Z -define void @width_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @width_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -37,7 +37,7 @@ ; FUNC-LABEL: {{^}}height_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].W -define void @height_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @height_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( @@ -50,7 +50,7 @@ ; FUNC-LABEL: {{^}}height_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].W -define void @height_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @height_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -67,7 +67,7 @@ ; FUNC-LABEL: {{^}}depth_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].X -define void @depth_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @depth_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -84,7 +84,7 @@ ; FUNC-LABEL: {{^}}data_type_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Y -define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] 
@llvm.OpenCL.image.get.format.2d( @@ -97,7 +97,7 @@ ; FUNC-LABEL: {{^}}data_type_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Y -define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( @@ -114,7 +114,7 @@ ; FUNC-LABEL: {{^}}channel_order_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Z -define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( @@ -127,7 +127,7 @@ ; FUNC-LABEL: {{^}}channel_order_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Z -define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( @@ -146,7 +146,7 @@ ; FUNC-LABEL: {{^}}image_arg_2nd: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[4].Z -define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, +define amdgpu_kernel void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, i32 %x, %opencl.image2d_t addrspace(1)* %in2, i32 addrspace(1)* %out) { Index: test/CodeGen/AMDGPU/image-resource-id.ll =================================================================== --- test/CodeGen/AMDGPU/image-resource-id.ll +++ test/CodeGen/AMDGPU/image-resource-id.ll @@ -7,7 +7,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only +define amdgpu_kernel void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( @@ -21,7 +21,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only +define amdgpu_kernel void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( @@ -37,7 +37,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only +define amdgpu_kernel void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( @@ -51,7 +51,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only +define amdgpu_kernel void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( @@ -67,7 +67,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -82,7 +82,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define 
amdgpu_kernel void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -97,7 +97,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -112,7 +112,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -129,7 +129,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -144,7 +144,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -159,7 +159,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -174,7 +174,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -191,7 +191,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -208,7 +208,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -226,7 +226,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only %opencl.image2d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -243,7 +243,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* 
%in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only %opencl.image3d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -261,7 +261,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -277,7 +277,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -293,7 +293,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -309,7 +309,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { Index: test/CodeGen/AMDGPU/imm.ll =================================================================== --- test/CodeGen/AMDGPU/imm.ll +++ test/CodeGen/AMDGPU/imm.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}i64_imm_inline_lo: ; GCN: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: -define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { +define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) { entry: store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 ret void @@ -15,7 +15,7 @@ ; GCN-LABEL: {{^}}i64_imm_inline_hi: ; GCN: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] -define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { +define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) { entry: store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 ret void @@ -25,7 +25,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { +define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { store i64 -9223372036854775808, i64 addrspace(1) *%out ret void } @@ -33,7 +33,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { store i32 -2147483648, i32 addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_0.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void 
@store_inline_imm_0.0_f32(float addrspace(1)* %out) { store float 0.0, float addrspace(1)* %out ret void } @@ -49,7 +49,7 @@ ; GCN-LABEL: {{^}}store_imm_neg_0.0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { store float -0.0, float addrspace(1)* %out ret void } @@ -57,7 +57,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_0.5_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { store float 0.5, float addrspace(1)* %out ret void } @@ -65,7 +65,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { store float -0.5, float addrspace(1)* %out ret void } @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_1.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { store float 1.0, float addrspace(1)* %out ret void } @@ -81,7 +81,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { store float -1.0, float addrspace(1)* %out ret void } @@ -89,7 +89,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_2.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { store float 2.0, float addrspace(1)* %out ret void } @@ -97,7 +97,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { store float -2.0, float addrspace(1)* %out ret void } @@ -105,7 +105,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_4.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { store float 4.0, float addrspace(1)* %out ret void } @@ -113,7 +113,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { store float -4.0, float addrspace(1)* %out ret void } @@ -123,7 +123,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e22f983{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0.15915494{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { store 
float 0x3FC45F3060000000, float addrspace(1)* %out ret void } @@ -131,7 +131,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f32: ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbe22f983{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { store float 0xBFC45F3060000000, float addrspace(1)* %out ret void } @@ -139,7 +139,7 @@ ; GCN-LABEL: {{^}}store_literal_imm_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 ; GCN: buffer_store_dword [[REG]] -define void @store_literal_imm_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) { store float 4096.0, float addrspace(1)* %out ret void } @@ -148,7 +148,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0.0 store float %y, float addrspace(1)* %out ret void @@ -158,7 +158,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0.5 store float %y, float addrspace(1)* %out ret void @@ -168,7 +168,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -0.5 store float %y, float addrspace(1)* %out ret void @@ -178,7 +178,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 1.0 store float %y, float addrspace(1)* %out ret void @@ -188,7 +188,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -1.0 store float %y, float addrspace(1)* %out ret void @@ -198,7 +198,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 2.0 store float %y, float addrspace(1)* %out ret void @@ -208,7 +208,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -2.0 store float %y, float addrspace(1)* %out ret void @@ -218,7 +218,7 @@ ; GCN: s_load_dword 
[[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 4.0 store float %y, float addrspace(1)* %out ret void @@ -228,7 +228,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -4.0 store float %y, float addrspace(1)* %out ret void @@ -238,7 +238,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] ; GCN: buffer_store_dword [[REG]] -define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %x = load float, float addrspace(1)* %in %y = fadd float %x, 0.5 store float %y, float addrspace(1)* %out @@ -249,7 +249,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] ; GCN: buffer_store_dword [[REG]] -define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %x = load float, float addrspace(1)* %in %y = fadd float %x, 1024.0 store float %y, float addrspace(1)* %out @@ -260,7 +260,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36a0000000000000 store float %y, float addrspace(1)* %out ret void @@ -270,7 +270,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36b0000000000000 store float %y, float addrspace(1)* %out ret void @@ -280,7 +280,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 16 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36e0000000000000 store float %y, float addrspace(1)* %out ret void @@ -290,7 +290,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xffffffffe0000000 store float %y, float addrspace(1)* %out ret void @@ -300,7 +300,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { %y = 
fadd float %x, 0xffffffffc0000000 store float %y, float addrspace(1)* %out ret void @@ -310,7 +310,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -16 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xfffffffe00000000 store float %y, float addrspace(1)* %out ret void @@ -320,7 +320,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 63 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36ff800000000000 store float %y, float addrspace(1)* %out ret void @@ -330,7 +330,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 64 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x3700000000000000 store float %y, float addrspace(1)* %out ret void @@ -342,7 +342,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0.0 store double %y, double addrspace(1)* %out ret void @@ -353,7 +353,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0.5 store double %y, double addrspace(1)* %out ret void @@ -364,7 +364,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -0.5 store double %y, double addrspace(1)* %out ret void @@ -375,7 +375,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 1.0 store double %y, double addrspace(1)* %out ret void @@ -386,7 +386,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -1.0 store double %y, double addrspace(1)* %out ret void @@ -397,7 +397,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], 
{{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 2.0 store double %y, double addrspace(1)* %out ret void @@ -408,7 +408,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -2.0 store double %y, double addrspace(1)* %out ret void @@ -419,7 +419,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 4.0 store double %y, double addrspace(1)* %out ret void @@ -430,7 +430,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -4.0 store double %y, double addrspace(1)* %out ret void @@ -445,7 +445,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}} ; VI: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x3fc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -455,7 +455,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xbfc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -466,7 +466,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000001 store double %y, double addrspace(1)* %out ret void @@ -477,7 +477,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000002 store double %y, double addrspace(1)* %out ret 
void @@ -488,7 +488,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000010 store double %y, double addrspace(1)* %out ret void @@ -499,7 +499,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xffffffffffffffff store double %y, double addrspace(1)* %out ret void @@ -510,7 +510,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xfffffffffffffffe store double %y, double addrspace(1)* %out ret void @@ -521,7 +521,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xfffffffffffffff0 store double %y, double addrspace(1)* %out ret void @@ -532,7 +532,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x000000000000003F store double %y, double addrspace(1)* %out ret void @@ -543,7 +543,7 @@ ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000040 store double %y, double addrspace(1)* %out ret void @@ -554,7 +554,7 @@ ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { store double 0.0, double addrspace(1)* %out ret void } @@ -564,7 +564,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { store double -0.0, double addrspace(1)* %out ret void } @@ -573,7 +573,7 @@ ; GCN-DAG: 
v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { store double 0.5, double addrspace(1)* %out ret void } @@ -582,7 +582,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { store double -0.5, double addrspace(1)* %out ret void } @@ -591,7 +591,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { store double 1.0, double addrspace(1)* %out ret void } @@ -600,7 +600,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { store double -1.0, double addrspace(1)* %out ret void } @@ -609,7 +609,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { store double 2.0, double addrspace(1)* %out ret void } @@ -618,7 +618,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { store double -2.0, double addrspace(1)* %out ret void } @@ -627,7 +627,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { store double 4.0, double addrspace(1)* %out ret void } @@ -636,7 +636,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { store double -4.0, double addrspace(1)* %out ret void } @@ -645,7 +645,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inv_2pi_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) { store double 0x3fc45f306dc9c882, double 
addrspace(1)* %out ret void } @@ -654,7 +654,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) { store double 0xbfc45f306dc9c882, double addrspace(1)* %out ret void } @@ -663,7 +663,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) { store double 4096.0, double addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/imm16.ll =================================================================== --- test/CodeGen/AMDGPU/imm16.ll +++ test/CodeGen/AMDGPU/imm16.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) { store volatile i16 -32768, i16 addrspace(1)* %out ret void } @@ -15,7 +15,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_0.0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) { store half 0.0, half addrspace(1)* %out ret void } @@ -24,7 +24,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) { store half -0.0, half addrspace(1)* %out ret void } @@ -32,7 +32,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_0.5_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) { store half 0.5, half addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) { store half -0.5, half addrspace(1)* %out ret void } @@ -49,7 +49,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_1.0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) { store half 1.0, half addrspace(1)* %out ret void } @@ -58,7 +58,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) { store half -1.0, half addrspace(1)* %out ret void } @@ -66,7 +66,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_2.0_f16: ; GCN: 
v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) { store half 2.0, half addrspace(1)* %out ret void } @@ -75,7 +75,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) { store half -2.0, half addrspace(1)* %out ret void } @@ -83,7 +83,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_4.0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) { store half 4.0, half addrspace(1)* %out ret void } @@ -92,7 +92,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) { store half -4.0, half addrspace(1)* %out ret void } @@ -101,7 +101,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) { store half 0xH3118, half addrspace(1)* %out ret void } @@ -110,7 +110,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) { store half 0xHB118, half addrspace(1)* %out ret void } @@ -118,7 +118,7 @@ ; GCN-LABEL: {{^}}store_literal_imm_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00 ; GCN: buffer_store_short [[REG]] -define void @store_literal_imm_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) { store half 4096.0, half addrspace(1)* %out ret void } @@ -127,7 +127,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.0 store half %y, half addrspace(1)* %out ret void @@ -137,7 +137,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.5 store half %y, half addrspace(1)* %out ret void @@ -147,7 +147,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -0.5 store half %y, half addrspace(1)* %out ret void @@ -157,7 
+157,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 1.0 store half %y, half addrspace(1)* %out ret void @@ -167,7 +167,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -1.0 store half %y, half addrspace(1)* %out ret void @@ -177,7 +177,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 2.0 store half %y, half addrspace(1)* %out ret void @@ -187,7 +187,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -2.0 store half %y, half addrspace(1)* %out ret void @@ -197,7 +197,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 4.0 store half %y, half addrspace(1)* %out ret void @@ -207,7 +207,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -4.0 store half %y, half addrspace(1)* %out ret void @@ -217,7 +217,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %x = load half, half addrspace(1)* %in %y = fadd half %x, 0.5 store half %y, half addrspace(1)* %out @@ -228,7 +228,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %x = load half, half addrspace(1)* %in %y = fadd half %x, 1024.0 store half %y, half addrspace(1)* %out @@ -239,7 +239,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0001 store half %y, half 
addrspace(1)* %out ret void @@ -249,7 +249,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0002 store half %y, half addrspace(1)* %out ret void @@ -259,7 +259,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0010 store half %y, half addrspace(1)* %out ret void @@ -269,7 +269,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFFF store half %y, half addrspace(1)* %out ret void @@ -279,7 +279,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFFE store half %y, half addrspace(1)* %out ret void @@ -289,7 +289,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFF0 store half %y, half addrspace(1)* %out ret void @@ -299,7 +299,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH003F store half %y, half addrspace(1)* %out ret void @@ -309,7 +309,7 @@ ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0040 store half %y, half addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/immv216.ll =================================================================== --- test/CodeGen/AMDGPU/immv216.ll +++ test/CodeGen/AMDGPU/immv216.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 { store <2 x i16> , <2 x i16> addrspace(1)* %out ret void } @@ -14,7 +14,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> 
addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -22,7 +22,7 @@ ; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -46,7 +46,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -62,7 +62,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -78,7 +78,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -86,7 +86,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -94,7 +94,7 @@ ; GCN-LABEL: 
{{^}}store_inline_imm_inv_2pi_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -102,7 +102,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -110,7 +110,7 @@ ; GCN-LABEL: {{^}}store_literal_imm_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00 ; GCN: buffer_store_dword [[REG]] -define void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> , <2 x half> addrspace(1)* %out ret void } @@ -126,7 +126,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -143,7 +143,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -160,7 +160,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -177,7 +177,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -194,7 +194,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -211,7 +211,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -228,7 +228,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, 
[[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -245,7 +245,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -262,7 +262,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -280,7 +280,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} ; VI: v_or_b32 ; VI: buffer_store_dword -define void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %x = load <2 x half>, <2 x half> addrspace(1)* %in %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out @@ -301,7 +301,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} ; VI: v_or_b32 ; VI: buffer_store_dword -define void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %x = load <2 x half>, <2 x half> addrspace(1)* %in %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out @@ -319,7 +319,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -336,7 +336,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -353,7 +353,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -370,7 +370,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x 
half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -387,7 +387,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -404,7 +404,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -421,7 +421,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -438,7 +438,7 @@ ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, store <2 x half> %y, <2 x half> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll @@ -10,7 +10,7 @@ ; CHECK: s_mov_b32 m0, [[IN]] ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]] ; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %ins = insertelement <4 x float> , float 5.0, i32 %in store <4 x float> %ins, <4 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -19,7 +19,7 @@ ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) { entry: %idx = add i32 %in, 1 %elt = extractelement <4 x float> , i32 %idx @@ -44,7 +44,7 @@ ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { +define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { entry: %idx = add i32 %in, 1 %vec = or <4 x i32> %or.val, @@ -66,7 +66,7 @@ ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { +define 
amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { entry: %elt = extractelement <4 x float> , i32 %in store float %elt, float addrspace(1)* %out @@ -84,7 +84,7 @@ ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 %value = extractelement <4 x i32> , i32 %index @@ -105,7 +105,7 @@ ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { +define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { entry: %index = add i32 %offset, -512 %or = or <4 x i32> %vec0, %vec1 @@ -137,7 +137,7 @@ ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[RESULT]] -define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { +define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 @@ -147,7 +147,7 @@ } ; GCN-LABEL: {{^}}extract_undef_offset_sgpr: -define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %value = extractelement <4 x i32> %ld, i32 undef @@ -159,7 +159,7 @@ ; GCN-DAG: buffer_load_dwordx4 ; MOVREL-DAG: s_mov_b32 m0, ; MOVREL: v_movreld_b32 -define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %value = insertelement <4 x i32> %ld, i32 5, i32 undef @@ -178,7 +178,7 @@ ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]] ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}} -define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %0 = add i32 %in, 1 %1 = insertelement <4 x float> , float 5.0, i32 %0 @@ -197,7 +197,7 @@ ; IDXMODE-NEXT: s_set_gpr_idx_off ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %0 = insertelement <4 x float> , float 5.0, i32 %in store <4 x float> %0, <4 x float> addrspace(1)* %out @@ -213,7 +213,7 @@ ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 %value = insertelement <4 x i32> , i32 5, i32 %index @@ -233,7 +233,7 @@ ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off -define void 
@insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { entry: %index = add i32 %offset, -512 %value = insertelement <4 x i32> %vec, i32 5, i32 %index @@ -270,7 +270,7 @@ ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword -define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 @@ -305,7 +305,7 @@ ; GCN: s_cbranch_execnz ; IDXMODE: s_set_gpr_idx_off -define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -16 @@ -375,7 +375,7 @@ ; GCN: buffer_store_dword [[MOVREL0]] ; GCN: buffer_store_dword [[MOVREL1]] -define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %id.ext = zext i32 %id to i64 @@ -450,7 +450,7 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: ; GCN: buffer_store_dword [[INS0]] -define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %id.ext = zext i32 %id to i64 @@ -499,7 +499,7 @@ ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @extract_adjacent_blocks(i32 %arg) #0 { +define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 { bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 @@ -549,7 +549,7 @@ ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { +define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 @@ -610,7 +610,7 @@ ; GCN: ds_write_b32 ; GCN: ds_write_b32 ; GCN: s_endpgm -define void @multi_same_block(i32 %arg) #0 { +define amdgpu_kernel void @multi_same_block(i32 %arg) #0 { bb: %tmp1 = add i32 %arg, -16 %tmp2 = insertelement <6 x float> , float 4.000000e+00, i32 %tmp1 @@ -637,7 +637,7 @@ ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %offset = add i32 %idx, 3 @@ -658,7 +658,7 @@ ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_out_of_bounds_offset(i32 
addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %offset = add i32 %idx, 4 @@ -681,7 +681,7 @@ ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) { +define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %idx.shl = shl i32 %idx.in, 2 @@ -702,7 +702,7 @@ ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind { +define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind { %idx.shl = shl i32 %idx.in, 2 %idx = or i32 %idx.shl, 1 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx @@ -729,7 +729,7 @@ ; IDXMODE: s_set_gpr_idx_idx ; IDXMODE: v_mov_b32_e32 ; GCN: s_cbranch_execnz [[REGLOOP]] -define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { +define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { bb: br label %bb2 Index: test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -20,7 +20,7 @@ ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write_b64 ; CI-PROMOTE: ds_read_b64 -define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { %val = load double, double addrspace(1)* %in, align 8 %array = alloca [8 x double], align 8 %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b @@ -51,7 +51,7 @@ ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 ; CI-PROMOTE: ds_read2_b64 -define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x double>], align 16 %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b @@ -77,7 +77,7 @@ ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write_b64 ; CI-PROMOTE: ds_read_b64 -define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { %val = load i64, i64 addrspace(1)* %in, align 8 %array = alloca [8 x i64], align 8 %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b @@ -109,7 +109,7 @@ ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 ; CI-PROMOTE: ds_read2_b64 -define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x 
i64> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x i64>], align 16 %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b Index: test/CodeGen/AMDGPU/infinite-loop-evergreen.ll =================================================================== --- test/CodeGen/AMDGPU/infinite-loop-evergreen.ll +++ test/CodeGen/AMDGPU/infinite-loop-evergreen.ll @@ -2,7 +2,7 @@ ; REQUIRES: asserts ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -define void @inf_loop_irreducible_cfg() nounwind { +define amdgpu_kernel void @inf_loop_irreducible_cfg() nounwind { entry: br label %block Index: test/CodeGen/AMDGPU/infinite-loop.ll =================================================================== --- test/CodeGen/AMDGPU/infinite-loop.ll +++ test/CodeGen/AMDGPU/infinite-loop.ll @@ -7,7 +7,7 @@ ; SI: buffer_store_dword [[REG]] ; SI: s_waitcnt vmcnt(0) expcnt(0) ; SI: s_branch BB0_1 -define void @infinite_loop(i32 addrspace(1)* %out) { +define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) { entry: br label %for.body Index: test/CodeGen/AMDGPU/inline-asm.ll =================================================================== --- test/CodeGen/AMDGPU/inline-asm.ll +++ test/CodeGen/AMDGPU/inline-asm.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}inline_asm: ; CHECK: s_endpgm ; CHECK: s_endpgm -define void @inline_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @inline_asm(i32 addrspace(1)* %out) { entry: store i32 5, i32 addrspace(1)* %out call void asm sideeffect "s_endpgm", ""() @@ -25,7 +25,7 @@ ; Make sure inline assembly is treted as divergent. ; CHECK: s_mov_b32 s{{[0-9]+}}, 0 ; CHECK: s_and_saveexec_b64 -define void @branch_on_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) { %zero = call i32 asm "s_mov_b32 $0, 0", "=s"() %cmp = icmp eq i32 %zero, 0 br i1 %cmp, label %if, label %endif @@ -44,7 +44,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]] ; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in) store i64 %sgpr, i64 addrspace(1)* %out ret void @@ -52,7 +52,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm: ; CHECK: codeLenInByte = 12 -define void @code_size_inline_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e64", ""() ret void @@ -61,7 +61,7 @@ ; All inlineasm instructions are assumed to be the maximum size ; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst: ; CHECK: codeLenInByte = 12 -define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e32", ""() ret void @@ -69,7 +69,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect " v_nop_e64 @@ -80,7 +80,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_2_inst_extra_newline(i32 
addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) { entry: call void asm sideeffect " v_nop_e64 @@ -92,7 +92,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect "", ""() ret void @@ -100,7 +100,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment", ""() ret void @@ -108,7 +108,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) { entry: call void asm sideeffect " ; comment", ""() @@ -117,7 +117,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment ", ""() @@ -126,7 +126,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; first comment ; second comment", ""() ret void @@ -134,7 +134,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; first comment;second comment", ""() ret void @@ -142,7 +142,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment v_nop_e64 ; inline comment @@ -157,7 +157,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e64 ; inline comment ; separate comment @@ -171,7 +171,7 @@ ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment v_add_i32_e32 v0, vcc, v1, v2 ; inline comment Index: test/CodeGen/AMDGPU/inline-calls.ll =================================================================== --- test/CodeGen/AMDGPU/inline-calls.ll +++ test/CodeGen/AMDGPU/inline-calls.ll @@ -11,7 +11,7 @@ ; CHECK: {{^}}kernel: ; CHECK-NOT: call -define void @kernel(i32 addrspace(1)* %out) { +define 
amdgpu_kernel void @kernel(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @func(i32 1) store i32 %tmp0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ ; CHECK: {{^}}kernel2: ; CHECK-NOT: call -define void @kernel2(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel2(i32 addrspace(1)* %out) { entry: call void @kernel(i32 addrspace(1)* %out) ret void @@ -31,7 +31,7 @@ ; CHECK: {{^}}kernel3: ; CHECK-NOT: call -define void @kernel3(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel3(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @func_alias(i32 1) store i32 %tmp0, i32 addrspace(1)* %out @@ -43,7 +43,7 @@ ; CHECK: {{^}}kernel4: ; CHECK-NOT: call -define void @kernel4(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel4(i32 addrspace(1)* %out) { entry: call void @kernel_alias(i32 addrspace(1)* %out) ret void Index: test/CodeGen/AMDGPU/inline-constraints.ll =================================================================== --- test/CodeGen/AMDGPU/inline-constraints.ll +++ test/CodeGen/AMDGPU/inline-constraints.ll @@ -10,7 +10,7 @@ ; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] ; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] -define void @inline_reg_constraints(i32 addrspace(1)* %ptr) { +define amdgpu_kernel void @inline_reg_constraints(i32 addrspace(1)* %ptr) { entry: %v32 = tail call i32 asm sideeffect "flat_load_dword $0, $1", "=v,v"(i32 addrspace(1)* %ptr) %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) @@ -27,7 +27,7 @@ ; GCN: s_mov_b32 m0, -1 ; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; GCN: ; use [[COPY_M0]] -define void @inline_sreg_constraint_m0() { +define amdgpu_kernel void @inline_sreg_constraint_m0() { %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"() tail call void asm sideeffect "; use $0", "s"(i32 %m0) ret void @@ -36,7 +36,7 @@ ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32: ; GCN: s_mov_b32 [[REG:s[0-9]+]], 32 ; GCN: ; use [[REG]] -define void @inline_sreg_constraint_imm_i32() { +define amdgpu_kernel void @inline_sreg_constraint_imm_i32() { tail call void asm sideeffect "; use $0", "s"(i32 32) ret void } @@ -44,7 +44,7 @@ ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32: ; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0 ; GCN: ; use [[REG]] -define void @inline_sreg_constraint_imm_f32() { +define amdgpu_kernel void @inline_sreg_constraint_imm_f32() { tail call void asm sideeffect "; use $0", "s"(float 1.0) ret void } @@ -54,7 +54,7 @@ ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}} ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}} ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} -define void @inline_sreg_constraint_imm_i64() { +define amdgpu_kernel void @inline_sreg_constraint_imm_i64() { tail call void asm sideeffect "; use $0", "s"(i64 -4) ret void } @@ -63,7 +63,7 @@ ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}} ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}} ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} -define void @inline_sreg_constraint_imm_f64() { +define amdgpu_kernel void @inline_sreg_constraint_imm_f64() { tail call void asm sideeffect "; use $0", "s"(double 1.0) ret void } Index: test/CodeGen/AMDGPU/inlineasm-16.ll =================================================================== --- test/CodeGen/AMDGPU/inlineasm-16.ll +++ test/CodeGen/AMDGPU/inlineasm-16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}s_input_output_i16: ; SICI: error: couldn't allocate output register for constraint 's' ; SICI: error: couldn't allocate input reg for constraint 
's' -define void @s_input_output_i16() #0 { +define amdgpu_kernel void @s_input_output_i16() #0 { %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i16 %v) #0 ret void @@ -14,7 +14,7 @@ ; GCN-LABEL: {{^}}v_input_output_i16: ; SICI: error: couldn't allocate output register for constraint 'v' ; SICI: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_i16() #0 { +define amdgpu_kernel void @v_input_output_i16() #0 { %v = tail call i16 asm sideeffect "v_mov_b32 $0, -1", "=v"() #0 tail call void asm sideeffect "; use $0", "v"(i16 %v) ret void @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}s_input_output_f16: ; SICI: error: couldn't allocate output register for constraint 's' ; SICI: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_f16() #0 { +define amdgpu_kernel void @s_input_output_f16() #0 { %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() #0 tail call void asm sideeffect "; use $0", "s"(half %v) ret void @@ -32,7 +32,7 @@ ; GCN-LABEL: {{^}}v_input_output_f16: ; SICI: error: couldn't allocate output register for constraint 'v' ; SICI: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_f16() #0 { +define amdgpu_kernel void @v_input_output_f16() #0 { %v = tail call half asm sideeffect "v_mov_b32 $0, -1", "=v"() #0 tail call void asm sideeffect "; use $0", "v"(half %v) ret void Index: test/CodeGen/AMDGPU/inlineasm-illegal-type.ll =================================================================== --- test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -3,7 +3,7 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i8() { +define amdgpu_kernel void @s_input_output_i8() { %v = tail call i8 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i8 %v) ret void @@ -11,7 +11,7 @@ ; GCN: error: couldn't allocate output register for constraint 'v' ; GCN: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_i8() { +define amdgpu_kernel void @v_input_output_i8() { %v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(i8 %v) ret void @@ -19,7 +19,7 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i128() { +define amdgpu_kernel void @s_input_output_i128() { %v = tail call i128 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i128 %v) ret void @@ -27,7 +27,7 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v8f16() { +define amdgpu_kernel void @s_input_output_v8f16() { %v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<8 x half> %v) ret void @@ -36,7 +36,7 @@ ; CI: error: couldn't allocate output register for constraint 's' ; CI: error: couldn't allocate input reg for constraint 's' ; VI-NOT: error -define void @s_input_output_f16() { +define amdgpu_kernel void @s_input_output_f16() { %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(half %v) ret void @@ -44,7 +44,7 @@ ; GCN: error: couldn't allocate output register 
for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v2f16() { +define amdgpu_kernel void @s_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x half> %v) ret void @@ -52,7 +52,7 @@ ; GCN: error: couldn't allocate output register for constraint 'v' ; GCN: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_v2f16() { +define amdgpu_kernel void @v_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(<2 x half> %v) ret void @@ -61,7 +61,7 @@ ; CI: error: couldn't allocate output register for constraint 's' ; CI: error: couldn't allocate input reg for constraint 's' ; VI-NOT: error -define void @s_input_output_i16() { +define amdgpu_kernel void @s_input_output_i16() { %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i16 %v) ret void @@ -69,14 +69,14 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v2i16() { +define amdgpu_kernel void @s_input_output_v2i16() { %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v) ret void } ; FIXME: Crash in codegen prepare -; define void @s_input_output_i3() { +; define amdgpu_kernel void @s_input_output_i3() { ; %v = tail call i3 asm sideeffect "s_mov_b32 $0, -1", "=s"() ; tail call void asm sideeffect "; use $0", "s"(i3 %v) ; ret void Index: test/CodeGen/AMDGPU/inlineasm-packed.ll =================================================================== --- test/CodeGen/AMDGPU/inlineasm-packed.ll +++ test/CodeGen/AMDGPU/inlineasm-packed.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}inline_asm_input_v2i16: ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} -define void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 { +define amdgpu_kernel void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 { entry: %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0 store i32 %val, i32 addrspace(1)* %out @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}inline_asm_input_v2f16: ; GCN: s_mov_b32 s0, s{{[0-9]+}} -define void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 { +define amdgpu_kernel void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 { entry: %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0 store i32 %val, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ ; GCN-LABEL: {{^}}inline_asm_output_v2i16: ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} -define void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 { entry: %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0 store <2 x i16> %val, <2 x i16> addrspace(1)* %out @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}inline_asm_output_v2f16: ; GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}} -define void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { entry: %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0 store <2 x half> %val, <2 x half> addrspace(1)* %out @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}inline_asm_packed_v2i16: ; GCN: v_pk_add_u16 
v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 { +define amdgpu_kernel void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 { entry: %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0 store <2 x i16> %val, <2 x i16> addrspace(1)* %out @@ -47,7 +47,7 @@ ; GCN-LABEL: {{^}}inline_asm_packed_v2f16: ; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 { +define amdgpu_kernel void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 { entry: %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0 store <2 x half> %val, <2 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/insert_subreg.ll =================================================================== --- test/CodeGen/AMDGPU/insert_subreg.ll +++ test/CodeGen/AMDGPU/insert_subreg.ll @@ -6,7 +6,7 @@ ; Make sure this doesn't crash ; CHECK-LABEL: test: -define void @test(i64 addrspace(1)* %out) { +define amdgpu_kernel void @test(i64 addrspace(1)* %out) { entry: %tmp0 = alloca [16 x i32] %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -18,56 +18,56 @@ ; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]: -define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_1: -define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_2: -define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_3: -define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4i32_0: -define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { +define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { %vecins = insertelement <4 x i32> %a, i32 
999, i32 0 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_1: -define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_2: -define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_3: -define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void @@ -86,7 +86,7 @@ ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 ret void @@ -97,7 +97,7 @@ ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: ; GCN-DAG: buffer_store_dword v -define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void @@ -107,7 +107,7 @@ ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void @@ -117,7 +117,7 @@ ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 ret void @@ -129,7 +129,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void 
@dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 ret void @@ -138,7 +138,7 @@ ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32: ; GCN: v_movreld_b32 ; GCN: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { %vecins = insertelement <2 x i32> %a, i32 5, i32 %b store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -148,7 +148,7 @@ ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: ; GCN-DAG: buffer_store_dword v -define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 ret void @@ -159,7 +159,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]] ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -169,7 +169,7 @@ ; GCN: v_movreld_b32 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { %vecins = insertelement <8 x i32> %a, i32 5, i32 %b store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 ret void @@ -181,21 +181,21 @@ ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { %vecins = insertelement <16 x i32> %a, i32 5, i32 %b store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i16: -define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { %vecins = insertelement <2 x i16> %a, i16 5, i32 %b store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i16: -define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { %vecins = insertelement <3 x i16> %a, i16 5, i32 %b store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, 
align 8 ret void @@ -222,7 +222,7 @@ ; GCN: buffer_load_dwordx2 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off -define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { %vecins = insertelement <4 x i16> %a, i16 5, i32 %b store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8 ret void @@ -242,7 +242,7 @@ ; GCN-TONGA: buffer_load_ushort ; GCN: buffer_store_short v{{[0-9]+}}, off -define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 ret void @@ -265,7 +265,7 @@ ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off -define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -291,21 +291,21 @@ ; GCN-TONGA: buffer_load_dword ; GCN: buffer_store_dword v{{[0-9]+}}, off -define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8: -define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { %vecins = insertelement <8 x i8> %a, i8 5, i32 %b store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8: -define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { %vecins = insertelement <16 x i8> %a, i8 5, i32 %b store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 ret void @@ -314,7 +314,7 @@ ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that ; the compiler doesn't crash. 
; GCN-LABEL: {{^}}insert_split_bb: -define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { +define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 %1 = icmp eq i32 %a, 0 @@ -361,7 +361,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 ret void @@ -374,14 +374,14 @@ ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { %vecins = insertelement <2 x i64> %a, i64 5, i32 %b store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64: -define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { %vecins = insertelement <3 x i64> %a, i64 5, i32 %b store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32 ret void @@ -411,7 +411,7 @@ ; GCN: s_endpgm ; GCN: ScratchSize: 64 -define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { %vecins = insertelement <4 x double> %a, double 8.0, i32 %b store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 ret void @@ -438,7 +438,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm ; GCN: ScratchSize: 128 -define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { %vecins = insertelement <8 x double> %a, double 8.0, i32 %b store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 ret void Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -10,7 +10,7 @@ ; GFX9-NOT: lshr ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]] -define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -28,7 +28,7 @@ ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]] -define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 
{ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -48,7 +48,7 @@ ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] ; GFX9-DAG: ; use [[ELT1]] -define void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt1 = extractelement <2 x i16> %vec, i32 1 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -68,7 +68,7 @@ ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]] -define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -87,7 +87,7 @@ ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]] ; GFX9: ; use [[ELT1]] -define void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -112,7 +112,7 @@ ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] ; GFX9: ; use [[ELT_HI]] ; GFX9: ; use [[VEC_HI]] -define void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -136,7 +136,7 @@ ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7 -define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -152,7 +152,7 @@ ; GCN-NOT: shlr ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]] -define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -166,7 +166,7 @@ ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]] 
-define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -181,7 +181,7 @@ ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500 -define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -197,7 +197,7 @@ ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -221,7 +221,7 @@ ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -244,7 +244,7 @@ ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -266,7 +266,7 @@ ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -283,7 +283,7 @@ ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* 
%out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -305,7 +305,7 @@ ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -325,7 +325,7 @@ ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -345,7 +345,7 @@ ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -362,7 +362,7 @@ ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -383,7 +383,7 @@ ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 { %idx = load volatile i32, i32 addrspace(2)* %idx.ptr %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx @@ -399,7 +399,7 @@ ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> 
addrspace(1)* %in, i32 %idx) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -424,7 +424,7 @@ ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -451,7 +451,7 @@ ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext Index: test/CodeGen/AMDGPU/inserted-wait-states.mir =================================================================== --- test/CodeGen/AMDGPU/inserted-wait-states.mir +++ test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -4,17 +4,17 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9 --- | - define void @div_fmas() { ret void } - define void @s_getreg() { ret void } - define void @s_setreg() { ret void } - define void @vmem_gt_8dw_store() { ret void } - define void @readwrite_lane() { ret void } - define void @rfe() { ret void } - define void @s_mov_fed_b32() { ret void } - define void @s_movrel() { ret void } - define void @v_interp() { ret void } - - define void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) { + define amdgpu_kernel void @div_fmas() { ret void } + define amdgpu_kernel void @s_getreg() { ret void } + define amdgpu_kernel void @s_setreg() { ret void } + define amdgpu_kernel void @vmem_gt_8dw_store() { ret void } + define amdgpu_kernel void @readwrite_lane() { ret void } + define amdgpu_kernel void @rfe() { ret void } + define amdgpu_kernel void @s_mov_fed_b32() { ret void } + define amdgpu_kernel void @s_movrel() { ret void } + define amdgpu_kernel void @v_interp() { ret void } + + define amdgpu_kernel void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 Index: test/CodeGen/AMDGPU/internalize.ll =================================================================== --- test/CodeGen/AMDGPU/internalize.ll +++ test/CodeGen/AMDGPU/internalize.ll @@ -8,14 +8,14 @@ @gvar_used = addrspace(1) global i32 undef, align 4 ; Function Attrs: alwaysinline nounwind -define void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 { +define amdgpu_kernel void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 { entry: store i32 1, i32 addrspace(1)* %out ret void } ; Function Attrs: alwaysinline nounwind -define void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 { +define 
amdgpu_kernel void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 { entry: store i32 %tid, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/invalid-addrspacecast.ll =================================================================== --- test/CodeGen/AMDGPU/invalid-addrspacecast.ll +++ test/CodeGen/AMDGPU/invalid-addrspacecast.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast -define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) { +define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %stof ret void Index: test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll =================================================================== --- test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -10,7 +10,7 @@ ; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], [[PTR]] -define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { +define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 store i16 123, i16 addrspace(1)* %ptr, align 4 @@ -22,7 +22,7 @@ ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}} ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]: -define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { +define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 store i16 123, i16 addrspace(1)* %ptr, align 4 Index: test/CodeGen/AMDGPU/invert-br-undef-vcc.mir =================================================================== --- test/CodeGen/AMDGPU/invert-br-undef-vcc.mir +++ test/CodeGen/AMDGPU/invert-br-undef-vcc.mir @@ -1,7 +1,7 @@ # RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s --- | - define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { entry: br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 Index: test/CodeGen/AMDGPU/kcache-fold.ll =================================================================== --- test/CodeGen/AMDGPU/kcache-fold.ll +++ test/CodeGen/AMDGPU/kcache-fold.ll @@ -2,7 +2,7 @@ ; CHECK: {{^}}main1: ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} -define void @main1() #0 { +define amdgpu_kernel void @main1() #0 { main_body: %tmp = load <4 x float>, <4 x float> addrspace(8)* null %tmp7 = extractelement <4 x float> %tmp, i32 0 @@ -54,7 +54,7 @@ ;
CHECK: {{^}}main2: ; CHECK-NOT: MOV -define void @main2() #0 { +define amdgpu_kernel void @main2() #0 { main_body: %tmp = load <4 x float>, <4 x float> addrspace(8)* null %tmp7 = extractelement <4 x float> %tmp, i32 0 Index: test/CodeGen/AMDGPU/kernarg-stack-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/kernarg-stack-alignment.ll +++ test/CodeGen/AMDGPU/kernarg-stack-alignment.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}no_args: ; CHECK: ScratchSize: 5{{$}} -define void @no_args() { +define amdgpu_kernel void @no_args() { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -13,7 +13,7 @@ ; CHECK-LABEL: {{^}}force_align32: ; CHECK: ScratchSize: 5{{$}} -define void @force_align32(<8 x i32>) { +define amdgpu_kernel void @force_align32(<8 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -21,7 +21,7 @@ ; CHECK-LABEL: {{^}}force_align64: ; CHECK: ScratchSize: 5{{$}} -define void @force_align64(<16 x i32>) { +define amdgpu_kernel void @force_align64(<16 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -29,7 +29,7 @@ ; CHECK-LABEL: {{^}}force_align128: ; CHECK: ScratchSize: 5{{$}} -define void @force_align128(<32 x i32>) { +define amdgpu_kernel void @force_align128(<32 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -37,7 +37,7 @@ ; CHECK-LABEL: {{^}}force_align256: ; CHECK: ScratchSize: 5{{$}} -define void @force_align256(<64 x i32>) { +define amdgpu_kernel void @force_align256(<64 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void Index: test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- test/CodeGen/AMDGPU/kernel-args.ll +++ test/CodeGen/AMDGPU/kernel-args.ll @@ -17,7 +17,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { +define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { entry: %0 = zext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -36,7 +36,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { +define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { entry: %0 = zext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -55,7 +55,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { +define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { entry: %0 = sext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -75,7 +75,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { +define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { entry: %0 = zext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -94,7 +94,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { +define 
amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { entry: %0 = zext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -113,7 +113,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { +define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { entry: %0 = sext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -126,7 +126,7 @@ ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8 -define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { +define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { entry: store i32 %in, i32 addrspace(1)* %out, align 4 ret void @@ -138,7 +138,7 @@ ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 -define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { +define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { entry: store float %in, float addrspace(1)* %out, align 4 ret void @@ -152,7 +152,7 @@ ; MESA-GCN: buffer_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { +define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { entry: store <2 x i8> %in, <2 x i8> addrspace(1)* %out ret void @@ -166,7 +166,7 @@ ; MESA-GCN: buffer_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { +define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out ret void @@ -179,7 +179,7 @@ ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { +define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { entry: store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 ret void @@ -192,7 +192,7 @@ ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 -define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { +define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { entry: store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 ret void @@ -209,7 +209,7 @@ ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { +define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -226,7 +226,7 @@ ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { +define amdgpu_kernel void @v3i16_arg(<3 x i16> 
addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { +define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { entry: store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 ret void @@ -253,7 +253,7 @@ ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { +define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { entry: store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 ret void @@ -273,7 +273,7 @@ ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { +define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out ret void @@ -293,7 +293,7 @@ ; HSA-GCN: flat_load_ushort ; HSA-GCN: flat_load_ushort ; HSA-GCN: flat_load_ushort -define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { +define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out ret void @@ -308,7 +308,7 @@ ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { +define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -323,7 +323,7 @@ ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { +define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { entry: store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 ret void @@ -354,7 +354,7 @@ ; HSA-GCN: float_load_ubyte ; HSA-GCN: float_load_ubyte ; HSA-GCN: float_load_ubyte -define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { +define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out ret void @@ -386,7 +386,7 @@ ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { +define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out ret void @@ -405,7 +405,7 @@ ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 -define void @v8i32_arg(<8 x i32> addrspace(1)* 
nocapture %out, <8 x i32> %in) nounwind { +define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 ret void @@ -422,7 +422,7 @@ ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 -define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { +define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 ret void @@ -478,7 +478,7 @@ ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { +define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out ret void @@ -534,7 +534,7 @@ ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { +define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out ret void @@ -561,7 +561,7 @@ ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { +define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 ret void @@ -588,7 +588,7 @@ ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { +define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 ret void @@ -599,7 +599,7 @@ ; MESA-GCN: s_load_dwordx2 ; MESA-GCN: buffer_store_dwordx2 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } @@ -611,7 +611,7 @@ ; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c ; MESA-GCN: buffer_store_dwordx2 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { entry: store double %in, double addrspace(1)* %out ret void @@ -621,7 +621,7 @@ ; XGCN: s_load_dwordx2 ; XGCN: s_load_dwordx2 ; XGCN: buffer_store_dwordx2 -; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { ; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 ; ret void ; } @@ -631,7 +631,7 @@ ; SI: v_and_b32_e32 ; SI: buffer_store_byte ; SI: s_endpgm -define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void 
@i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { store i1 %x, i1 addrspace(1)* %out, align 1 ret void } @@ -640,7 +640,7 @@ ; SI: buffer_load_ubyte ; SI: buffer_store_dword ; SI: s_endpgm -define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -650,7 +650,7 @@ ; SI: buffer_load_ubyte ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void @@ -660,7 +660,7 @@ ; SI: buffer_load_ubyte ; SI: buffer_store_dword ; SI: s_endpgm -define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -672,7 +672,7 @@ ; SI: v_ashrrev_i32 ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void Index: test/CodeGen/AMDGPU/large-alloca-compute.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-compute.ll +++ test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -48,7 +48,7 @@ ; Scratch size = alloca size + emergency stack slot ; ALL: ; ScratchSize: 32772 -define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { +define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { %large = alloca [8192 x i32], align 4 %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 store volatile i32 %x, i32* %gep Index: test/CodeGen/AMDGPU/large-constant-initializer.ll =================================================================== --- test/CodeGen/AMDGPU/large-constant-initializer.ll +++ test/CodeGen/AMDGPU/large-constant-initializer.ll @@ -4,7 +4,7 @@ @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 -define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { +define amdgpu_kernel void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 %mul12 = mul nsw i32 %val, 7 br i1 undef, label %exit, label %bb Index: test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll =================================================================== --- test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll +++ test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll @@ -4,7 +4,7 @@ ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -26,7 +26,7 @@ ; ALL: @promote_alloca_size_256.stack = internal
unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { +define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -48,7 +48,7 @@ ; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { +define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -71,7 +71,7 @@ ; ALL-LABEL: @occupancy_0( ; CI-NOT: alloca [5 x i32] ; SI: alloca [5 x i32] -define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { +define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -94,7 +94,7 @@ ; ALL-LABEL: @occupancy_max( ; CI-NOT: alloca [5 x i32] ; SI: alloca [5 x i32] -define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { +define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -118,7 +118,7 @@ ; CI-LABEL: @occupancy_6( ; SI: alloca ; CI-NOT: alloca -define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [42 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -142,7 +142,7 @@ ; ALL-LABEL: @occupancy_6_over( ; ALL: alloca [43 x i8] -define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [43 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -168,7 +168,7 @@ ; CI-LABEL: @occupancy_8( ; SI: alloca ; CI-NOT: alloca -define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [32 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -192,7 +192,7 @@ ; ALL-LABEL: @occupancy_8_over( ; ALL: alloca [33 x i8] -define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [33 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -218,7 +218,7 @@ ; CI-LABEL: @occupancy_9( ; SI: alloca ; CI-NOT: alloca -define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: %stack = alloca [28 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -242,7 +242,7 @@ ; ALL-LABEL: 
@occupancy_9_over( ; ALL: alloca [29 x i8] -define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: %stack = alloca [29 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 Index: test/CodeGen/AMDGPU/lds-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/lds-alignment.ll +++ test/CodeGen/AMDGPU/lds-alignment.ll @@ -15,7 +15,7 @@ ; HSA-LABEL: {{^}}test_no_round_size_1: ; HSA: workgroup_group_segment_byte_size = 38 -define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) @@ -34,7 +34,7 @@ ; HSA-LABEL: {{^}}test_round_size_2: ; HSA: workgroup_group_segment_byte_size = 86 ; HSA: group_segment_alignment = 4 -define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) @@ -50,7 +50,7 @@ ; HSA-LABEL: {{^}}test_round_size_2_align_8: ; HSA: workgroup_group_segment_byte_size = 86 ; HSA: group_segment_alignment = 4 -define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -65,7 +65,7 @@ ; HSA-LABEL: {{^}}test_round_local_lds_and_arg: ; HSA: workgroup_group_segment_byte_size = 38 ; HSA: group_segment_alignment = 4 -define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { +define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) @@ -78,7 +78,7 @@ ; HSA-LABEL: {{^}}test_round_lds_arg: ; HSA: workgroup_group_segment_byte_size = 0 ; HSA: group_segment_alignment = 4 -define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { +define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) 
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false) ret void @@ -88,7 +88,7 @@ ; HSA-LABEL: {{^}}test_high_align_lds_arg: ; HSA: workgroup_group_segment_byte_size = 0 ; HSA: group_segment_alignment = 4 -define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { +define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false) ret void @@ -98,7 +98,7 @@ ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: ; HSA: workgroup_group_segment_byte_size = 212 ; HSA: group_segment_alignment = 4 -define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false) @@ -114,7 +114,7 @@ ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: ; HSA: workgroup_group_segment_byte_size = 216 ; HSA: group_segment_alignment = 4 -define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false) @@ -142,7 +142,7 @@ ; HSA-LABEL: {{^}}test_round_size_3_order0: ; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) @@ -163,7 +163,7 @@ ; HSA-LABEL: {{^}}test_round_size_3_order1: ; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) @@ -184,7 
+184,7 @@ ; HSA-LABEL: {{^}}test_round_size_3_order2: ; HSA: workgroup_group_segment_byte_size = 150 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -205,7 +205,7 @@ ; HSA-LABEL: {{^}}test_round_size_3_order3: ; HSA: workgroup_group_segment_byte_size = 118 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -226,7 +226,7 @@ ; HSA-LABEL: {{^}}test_round_size_3_order4: ; HSA: workgroup_group_segment_byte_size = 142 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) @@ -247,7 +247,7 @@ ; HSA-LABEL: {{^}}test_round_size_3_order5: ; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) Index: test/CodeGen/AMDGPU/lds-initializer.ll =================================================================== --- test/CodeGen/AMDGPU/lds-initializer.ll +++ test/CodeGen/AMDGPU/lds-initializer.ll @@ -5,7 +5,7 @@ @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] -define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { +define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 %ld = load i32, i32 addrspace(3)* %gep store i32 %ld, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll =================================================================== --- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll +++ test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll @@ -18,7 +18,7 @@ ; GCN: BB0_3: ; GCN-NEXT: s_endpgm -define void 
@copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 { +define amdgpu_kernel void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge Index: test/CodeGen/AMDGPU/lds-oqap-crash.ll =================================================================== --- test/CodeGen/AMDGPU/lds-oqap-crash.ll +++ test/CodeGen/AMDGPU/lds-oqap-crash.ll @@ -10,7 +10,7 @@ ; reads and writes are bundled together in the same instruction. ; CHECK: {{^}}lds_crash: -define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { entry: %0 = load i32, i32 addrspace(3)* %in ; This block needs to be > 115 ISA instructions to hit the bug, Index: test/CodeGen/AMDGPU/lds-output-queue.ll =================================================================== --- test/CodeGen/AMDGPU/lds-output-queue.ll +++ test/CodeGen/AMDGPU/lds-output-queue.ll @@ -10,7 +10,7 @@ @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 -define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { entry: %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index %1 = load i32, i32 addrspace(3)* %0 @@ -88,7 +88,7 @@ ; CHECK: LDS_READ_RET ; CHECK-NOT: ALU clause ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP -define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 %1 = load i32, i32 addrspace(3)* %0 Index: test/CodeGen/AMDGPU/lds-size.ll =================================================================== --- test/CodeGen/AMDGPU/lds-size.ll +++ test/CodeGen/AMDGPU/lds-size.ll @@ -14,7 +14,7 @@ ; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only) @lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 -define void @test(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %if, label %else Index: test/CodeGen/AMDGPU/lds-zero-initializer.ll =================================================================== --- test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -5,7 +5,7 @@ @lds = addrspace(3) global [256 x i32] zeroinitializer -define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { +define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 %ld = load i32, i32 addrspace(3)* %gep store i32 %ld, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll =================================================================== --- test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll +++ test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll @@ -11,7 +11,7 @@ ; CHECK: {{^}}setcc_expand: ; CHECK: SET ; CHECK-NOT: CND -define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { 
+define amdgpu_kernel void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp eq i32 %in, 5 br i1 %0, label %IF, label %ENDIF Index: test/CodeGen/AMDGPU/literals.ll =================================================================== --- test/CodeGen/AMDGPU/literals.ll +++ test/CodeGen/AMDGPU/literals.ll @@ -10,7 +10,7 @@ ; CHECK: LSHR ; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 5 -define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: %0 = add i32 5, %in store i32 %0, i32 addrspace(1)* %out @@ -27,7 +27,7 @@ ; CHECK: LSHR ; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 1084227584(5.0 -define void @float_literal(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @float_literal(float addrspace(1)* %out, float %in) { entry: %0 = fadd float 5.0, %in store float %0, float addrspace(1)* %out @@ -41,7 +41,7 @@ ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 -define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { entry: store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out ret void @@ -52,7 +52,7 @@ ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 ; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 -define void @inline_literal_dot4(float addrspace(1)* %out) { +define amdgpu_kernel void @inline_literal_dot4(float addrspace(1)* %out) { entry: %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) store float %0, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/liveness.mir =================================================================== --- test/CodeGen/AMDGPU/liveness.mir +++ test/CodeGen/AMDGPU/liveness.mir @@ -8,7 +8,7 @@ # Should see three distinct value numbers: # CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}} --- | - define void @test0() { ret void } + define amdgpu_kernel void @test0() { ret void } ...
--- name: test0 Index: test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll @@ -8,7 +8,7 @@ ; SI: v_bfe_i32 ; EG: BFE_INT ; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -17,7 +17,7 @@ ; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: ; SI: v_bfe_i32 ; EG: BFE_INT -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ ; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: ; SI: v_bfe_i32 ; EG: BFE_INT -define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ ; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: ; SI: v_bfe_i32 ; EG: BFE_INT -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -43,7 +43,7 @@ ; FUNC-LABEL: {{^}}v_bfe_print_arg: ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { +define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { %load = load i32, i32 addrspace(1)* %src0, align 4 %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 @@ -54,7 +54,7 @@ ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -64,7 +64,7 @@ ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -74,7 +74,7 @@ ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI: s_endpgm -define void @bfe_i32_test_6(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) @@ -88,7 +88,7 @@ ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) @@ -100,7 +100,7 @@ ; SI: buffer_load_dword ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; SI: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) @@ -113,7 +113,7 @@ ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -125,7 +125,7 @@ ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -137,7 +137,7 @@ ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -149,7 +149,7 @@ ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -160,7 +160,7 @@ ; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) @@ -171,7 +171,7 @@ ; SI-NOT: lshr ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define 
amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) @@ -184,7 +184,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -196,7 +196,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -208,7 +208,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -220,7 +220,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -232,7 +232,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -244,7 +244,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -256,7 +256,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -268,7 +268,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -280,7 +280,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) 
nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -292,7 +292,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -304,7 +304,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -316,7 +316,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -328,7 +328,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -340,7 +340,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -352,7 +352,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -364,7 +364,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -376,7 +376,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -388,7 +388,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void 
@bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -400,7 +400,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -412,7 +412,7 @@ ; SI-NOT: v_ashr ; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 ; SI: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) %shl = shl i32 %bfe, 8 @@ -428,7 +428,7 @@ ; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] ; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; SI: buffer_store_dword [[TMP2]] -define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %src = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone %div = sdiv i32 %bfe, 2 Index: test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll @@ -8,7 +8,7 @@ ; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -17,7 +17,7 @@ ; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ ; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ ; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 
123, i32 %src1, i32 %src2) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -45,7 +45,7 @@ ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -55,7 +55,7 @@ ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -65,7 +65,7 @@ ; SI: buffer_load_ubyte ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { %load = load i8, i8 addrspace(1)* %in %ext = zext i8 %load to i32 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) @@ -82,7 +82,7 @@ ; VI-NEXT: v_and_b32_e32 ; SI-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -97,7 +97,7 @@ ; SI-NEXT: v_and_b32_e32 ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -111,7 +111,7 @@ ; SI: v_add_i32 ; SI: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -126,7 +126,7 @@ ; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 ; SI-NEXT: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -141,7 +141,7 @@ ; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 ; SI-NEXT: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -155,7 +155,7 @@ ; SI: v_add_i32 ; SI-NEXT: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void 
@bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -169,14 +169,14 @@ ; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; SI: s_endpgm ; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) @@ -184,7 +184,7 @@ ret void } -define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) @@ -199,7 +199,7 @@ ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = lshr i32 %shl, 31 @@ -214,7 +214,7 @@ ; SI-NOT: shr ; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 ; SI: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -227,7 +227,7 @@ ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) @@ -239,7 +239,7 @@ ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) @@ -252,7 +252,7 @@ ; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) @@ -265,7 +265,7 @@ ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void 
@bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -277,7 +277,7 @@ ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -289,7 +289,7 @@ ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -301,7 +301,7 @@ ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -312,7 +312,7 @@ ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) @@ -323,7 +323,7 @@ ; SI-NOT: lshr ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) @@ -336,7 +336,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -348,7 +348,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -360,7 +360,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind 
{ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -372,7 +372,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -384,7 +384,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -396,7 +396,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -408,7 +408,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -420,7 +420,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -432,7 +432,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -444,7 +444,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -456,7 +456,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -468,7 +468,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void 
@bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -480,7 +480,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -492,7 +492,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -504,7 +504,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -516,7 +516,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -528,7 +528,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -540,7 +540,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -552,7 +552,7 @@ ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -569,7 +569,7 @@ ; SI-DAG: buffer_store_dword [[AND]] ; SI-DAG: buffer_store_dword [[BFE]] ; SI: s_endpgm -define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, +define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) nounwind { %src = load i32, i32 addrspace(1)* %in, align 4 @@ -583,7 +583,7 @@ ; FUNC-LABEL: {{^}}lshr_and: ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; SI: buffer_store_dword -define void @lshr_and(i32 addrspace(1)* 
%out, i32 %a) nounwind { +define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { %b = lshr i32 %a, 6 %c = and i32 %b, 7 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -593,7 +593,7 @@ ; FUNC-LABEL: {{^}}v_lshr_and: ; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 ; SI: buffer_store_dword -define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %c = lshr i32 %a, %b %d = and i32 %c, 7 store i32 %d, i32 addrspace(1)* %out, align 8 @@ -603,7 +603,7 @@ ; FUNC-LABEL: {{^}}and_lshr: ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; SI: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { %b = and i32 %a, 448 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -613,7 +613,7 @@ ; FUNC-LABEL: {{^}}and_lshr2: ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; SI: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { %b = and i32 %a, 511 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -623,7 +623,7 @@ ; FUNC-LABEL: {{^}}shl_lshr: ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 ; SI: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { %b = shl i32 %a, 9 %c = lshr i32 %b, 11 store i32 %c, i32 addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/llvm.SI.export.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.export.ll +++ test/CodeGen/AMDGPU/llvm.SI.export.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test_export_zeroes: ; GCN: exp mrt0 off, off, off, off{{$}} ; GCN: exp mrt0 off, off, off, off done{{$}} -define void @test_export_zeroes() #0 { +define amdgpu_kernel void @test_export_zeroes() #0 { call void @llvm.SI.export(i32 0, i32 0, i32 0, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0) call void @llvm.SI.export(i32 0, i32 0, i32 1, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0) @@ -21,7 +21,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} -define void @test_export_en_src0() #0 { +define amdgpu_kernel void @test_export_en_src0() #0 { call void @llvm.SI.export(i32 1, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -32,7 +32,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} -define void @test_export_en_src1() #0 { +define amdgpu_kernel void @test_export_en_src1() #0 { call void @llvm.SI.export(i32 2, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -43,7 +43,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} -define void @test_export_en_src2() #0 { +define amdgpu_kernel void @test_export_en_src2() #0 { call void @llvm.SI.export(i32 4, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -54,7 +54,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, off, 
[[SRC3]] done{{$}} -define void @test_export_en_src3() #0 { +define amdgpu_kernel void @test_export_en_src3() #0 { call void @llvm.SI.export(i32 8, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -65,7 +65,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} -define void @test_export_en_src0_src1() #0 { +define amdgpu_kernel void @test_export_en_src0_src1() #0 { call void @llvm.SI.export(i32 3, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -76,7 +76,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} -define void @test_export_en_src0_src2() #0 { +define amdgpu_kernel void @test_export_en_src0_src2() #0 { call void @llvm.SI.export(i32 5, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -88,7 +88,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} -define void @test_export_en_src0_src3() #0 { +define amdgpu_kernel void @test_export_en_src0_src3() #0 { call void @llvm.SI.export(i32 9, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 9, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -101,7 +101,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_en_src0_src1_src2_src3() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -111,7 +111,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} -define void @test_export_mrt7() #0 { +define amdgpu_kernel void @test_export_mrt7() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5) ret void @@ -124,7 +124,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_z() #0 { +define amdgpu_kernel void @test_export_z() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -137,7 +137,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_null() #0 { +define amdgpu_kernel void @test_export_null() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void 
@@ -150,7 +150,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved10() #0 { +define amdgpu_kernel void @test_export_reserved10() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -163,7 +163,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved11() #0 { +define amdgpu_kernel void @test_export_reserved11() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -176,7 +176,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos0() #0 { +define amdgpu_kernel void @test_export_pos0() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -189,7 +189,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos3() #0 { +define amdgpu_kernel void @test_export_pos3() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -202,7 +202,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param0() #0 { +define amdgpu_kernel void @test_export_param0() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -215,7 +215,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param31() #0 { +define amdgpu_kernel void @test_export_param31() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -228,7 +228,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} -define void @test_export_vm() #0 { +define amdgpu_kernel void @test_export_vm() #0 { call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 
1, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -14,7 +14,7 @@ ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] -define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* %out ret void @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -35,7 +35,7 @@ ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_dec_u32 [[VPTR]], [[DATA]] -define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) ret void } @@ -43,7 +43,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42) ret void @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* %out ret void @@ -61,7 +61,7 @@ ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -70,7 +70,7 @@ ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32: ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { +define 
amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) ret void } @@ -78,7 +78,7 @@ ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) ret void @@ -88,7 +88,7 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} ; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id @@ -102,7 +102,7 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} ; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 @@ -113,7 +113,7 @@ ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) store i32 %result, i32 addrspace(4)* %out ret void @@ -122,7 +122,7 @@ ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) store i32 %result, i32 addrspace(4)* %out @@ -131,7 +131,7 @@ ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32: ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) ret void } @@ -139,7 +139,7 @@ ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define 
void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) ret void @@ -148,7 +148,7 @@ ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id @@ -161,7 +161,7 @@ ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 @@ -173,7 +173,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) store i64 %result, i64 addrspace(4)* %out ret void @@ -183,7 +183,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42) store i64 %result, i64 addrspace(4)* %out @@ -194,7 +194,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) ret void } @@ -203,7 +203,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* 
%gep, i64 42) ret void @@ -213,7 +213,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id @@ -227,7 +227,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 @@ -240,7 +240,7 @@ ; SI-LABEL: {{^}}atomic_dec_shl_base_lds_0: ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]] offset:8 -define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -254,7 +254,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) store i64 %result, i64 addrspace(1)* %out ret void @@ -264,7 +264,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 -define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -275,7 +275,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) ret void } @@ -284,7 +284,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define void 
@lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42) ret void @@ -294,7 +294,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) store i64 %result, i64 addrspace(1)* %out ret void @@ -304,7 +304,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -315,7 +315,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) ret void } @@ -324,7 +324,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) ret void @@ -335,7 +335,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id @@ -350,7 +350,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @global_atomic_dec_noret_i64_offset_addr64(i64 
addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 @@ -363,7 +363,7 @@ ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 -define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -14,7 +14,7 @@ ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] -define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* %out ret void @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -35,7 +35,7 @@ ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_inc_u32 [[VPTR]], [[DATA]] -define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) ret void } @@ -43,7 +43,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42) ret void @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* 
%out ret void @@ -61,7 +61,7 @@ ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -70,7 +70,7 @@ ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32: ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) ret void } @@ -78,7 +78,7 @@ ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) ret void @@ -88,7 +88,7 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} ; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id @@ -102,7 +102,7 @@ ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} ; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 @@ -115,7 +115,7 @@ ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -129,7 +129,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void 
@lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +139,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 -define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -150,7 +150,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) ret void } @@ -159,7 +159,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42) ret void @@ -169,7 +169,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) store i64 %result, i64 addrspace(1)* %out ret void @@ -179,7 +179,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -190,7 +190,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) ret void } @@ -199,7 +199,7 @@ ; GCN-DAG: 
v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) ret void @@ -210,7 +210,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id @@ -225,7 +225,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 @@ -236,7 +236,7 @@ ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) store i32 %result, i32 addrspace(4)* %out ret void @@ -245,7 +245,7 @@ ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) store i32 %result, i32 addrspace(4)* %out @@ -254,7 +254,7 @@ ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32: ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) ret void } @@ -262,7 +262,7 @@ ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32_offset(i32 
addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) ret void @@ -271,7 +271,7 @@ ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id @@ -284,7 +284,7 @@ ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 @@ -297,7 +297,7 @@ ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 -define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 @@ -320,7 +320,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) store i64 %result, i64 addrspace(4)* %out ret void @@ -330,7 +330,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42) store i64 %result, i64 addrspace(4)* %out @@ -341,7 +341,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { %result = call i64 
@llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
ret void
}
@@ -350,7 +350,7 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
ret void
@@ -360,7 +360,7 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
%out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
@@ -374,7 +374,7 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
%gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
@@ -8,7 +8,7 @@
; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
; GCN-NEXT: s_endpgm
-define void @test_buffer_wbinvl1() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1() #0 {
call void @llvm.amdgcn.buffer.wbinvl1()
ret void
}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
@@ -6,7 +6,7 @@
; SI-NEXT: ; BB#0:
; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
; SI-NEXT: s_endpgm
-define void @test_buffer_wbinvl1_sc() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 {
call void @llvm.amdgcn.buffer.wbinvl1.sc()
ret void
}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
@@ -8,7 +8,7 @@
; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
; GCN: s_endpgm
-define void @test_buffer_wbinvl1_vol() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_vol() #0 {
call void @llvm.amdgcn.buffer.wbinvl1.vol()
; This used to crash in hazard recognizer
store i8 0, i8 addrspace(1)* undef, align 1
Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -10,7 +10,7 @@
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @class_f16(
+define amdgpu_kernel void @class_f16(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
i32 addrspace(1)* %b) {
@@ -31,7 +31,7 @@
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fabs(
+define amdgpu_kernel void @class_f16_fabs(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -51,7 +51,7 @@
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fneg(
+define amdgpu_kernel void @class_f16_fneg(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -71,7 +71,7 @@
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fabs_fneg(
+define amdgpu_kernel void @class_f16_fabs_fneg(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -91,7 +91,7 @@
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_1(
+define amdgpu_kernel void @class_f16_1(
i32 addrspace(1)* %r,
half %a.val) {
entry:
@@ -108,7 +108,7 @@
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_64(
+define amdgpu_kernel void @class_f16_64(
i32 addrspace(1)* %r,
half %a.val) {
entry:
@@ -126,7 +126,7 @@
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_full_mask(
+define amdgpu_kernel void @class_f16_full_mask(
i32 addrspace(1)* %r,
half %a.val) {
entry:
@@ -144,7 +144,7 @@
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_nine_bit_mask(
+define amdgpu_kernel void @class_f16_nine_bit_mask(
i32 addrspace(1)* %r,
half %a.val) {
entry:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -14,7 +14,7 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -29,7 +29,7 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@@ -45,7 +45,7 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fneg_f32(i32 addrspace(1)* %out,
float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fneg = fsub float -0.0, %a %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -61,7 +61,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %a.fneg.fabs = fsub float -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 @@ -76,7 +76,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -104,7 +104,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -118,7 +118,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -132,7 +132,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -150,7 +150,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 
%tid @@ -170,7 +170,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -190,7 +190,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -205,7 +205,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -221,7 +221,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fneg = fsub double -0.0, %a %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -237,7 +237,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %a.fneg.fabs = fsub double -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 @@ -249,7 +249,7 @@ ; SI-LABEL: {{^}}test_class_1_f64: ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} ; SI: s_endpgm -define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -259,7 +259,7 @@ ; SI-LABEL: {{^}}test_class_64_f64: ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} ; SI: s_endpgm -define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -275,7 +275,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_full_mask_f64(i32 
addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -290,7 +290,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -306,7 +306,7 @@ ; XSI: v_cmp_class_f64_e32 vcc, 1.0, ; SI: v_cmp_class_f64_e32 vcc, ; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -321,7 +321,7 @@ ; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: ; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -338,7 +338,7 @@ ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -358,7 +358,7 @@ ; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -381,7 +381,7 @@ ; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -416,7 +416,7 @@ ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -436,7 +436,7 @@
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -456,7 +456,7 @@
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
-define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -476,7 +476,7 @@
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -488,7 +488,7 @@
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -500,7 +500,7 @@
; SI-NOT: v_cmp_class
; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
; SI: buffer_store_dword
-define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
Index: test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
@@ -7,7 +7,7 @@
; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @cos_f16(
+define amdgpu_kernel void @cos_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}v_cos_f32:
; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
%cos = call float @llvm.amdgcn.cos.f32(float %src) #0
store float %cos, float addrspace(1)* %out
ret void
Index: test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}test_cubeid:
; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
Index: test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}test_cubema:
; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
Index: test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}test_cubesc:
; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
Index: test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}test_cubetc:
; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
Index: test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -8,7 +8,7 @@
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]]
-define void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
store <2 x half> %result, <2 x half> addrspace(1)* %out
ret void
@@ -17,7 +17,7 @@
; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]]
-define void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel
void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -29,7 +29,7 @@ ; SI-NEXT: s_endpgm ; VI-NEXT: s_endpgm ; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -define void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -40,7 +40,7 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]] -define void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -56,7 +56,7 @@ ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0 -define void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -71,7 +71,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]] -define void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -86,7 +86,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -104,7 +104,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float 
addrspace(1)* %a.ptr, i64 %tid.ext @@ -122,7 +122,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -141,7 +141,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext Index: test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -9,7 +9,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @dispatch_id(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_dispatch_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* %value = load i32, i32 addrspace(2)* %header_ptr Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -9,7 +9,7 @@ ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16( +define amdgpu_kernel void @div_fixup_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -30,7 +30,7 @@ ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a( +define amdgpu_kernel void @div_fixup_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -49,7 +49,7 @@ ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define 
void @div_fixup_f16_imm_b( +define amdgpu_kernel void @div_fixup_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -68,7 +68,7 @@ ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -86,7 +86,7 @@ ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a_imm_b( +define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b( half addrspace(1)* %r, half addrspace(1)* %c) { entry: @@ -102,7 +102,7 @@ ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_b_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -118,7 +118,7 @@ ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c( half addrspace(1)* %r, half addrspace(1)* %b) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll @@ -16,7 +16,7 @@ ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -24,7 +24,7 @@ ; GCN-LABEL: {{^}}test_div_fixup_f64: ; GCN: v_div_fixup_f64 -define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -20,7 +20,7 @@ ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -34,7 +34,7 @@ ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define 
amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -48,7 +48,7 @@ ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -62,7 +62,7 @@ ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}test_div_fmas_f64: ; GCN: v_div_fmas_f64 -define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: ; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone store float %result, float addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: ; SI: s_mov_b64 vcc, 0 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -98,7 +98,7 @@ ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: ; SI: s_mov_b64 vcc, -1 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float 
@llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -114,7 +114,7 @@ ; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] ; SI: s_endpgm -define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 @@ -150,7 +150,7 @@ ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword ; SI: s_endpgm -define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -11,7 +11,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -31,7 +31,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -51,7 +51,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -71,7 +71,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) 
nounwind { +define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -91,7 +91,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -109,7 +109,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -127,7 +127,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -145,7 +145,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -163,7 +163,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -181,7 +181,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { %tid = call i32 
@llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -199,7 +199,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -217,7 +217,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -236,7 +236,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -250,7 +250,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -265,7 +265,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -280,7 +280,7 @@ ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { %result = call { double, i1 } 
@llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -292,7 +292,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -308,7 +308,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -326,7 +326,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -349,7 +349,7 @@ ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}ds_bpermute: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void @@ -14,7 +14,7 @@ ; CHECK-LABEL: {{^}}ds_bpermute_imm_offset: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4 ; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) 
nounwind { %index = add i32 %base_index, 4 %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 @@ -24,7 +24,7 @@ ; CHECK-LABEL: {{^}}ds_bpermute_imm_index: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}ds_permute: ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CHECK: s_waitcnt lgkmcnt -define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { +define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void @@ -14,7 +14,7 @@ ; CHECK-LABEL: {{^}}ds_permute_imm_offset: ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4 ; CHECK: s_waitcnt lgkmcnt -define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %index = add i32 %base_index, 4 %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}ds_swizzle: ; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100 ; CHECK: s_waitcnt lgkmcnt -define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 store i32 %swizzle, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2f16: ; GCN: exp mrt0 off, off, off, off compr{{$}} ; GCN: exp mrt0 off, off, off, off done compr{{$}} -define void @test_export_compr_zeroes_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_zeroes_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 true, i1 false) ret void @@ -18,7 +18,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], off, off done compr{{$}} -define void @test_export_compr_en_src0_v2f16() #0 { +define amdgpu_kernel void 
@test_export_compr_en_src0_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -27,7 +27,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src1_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_src1_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -36,7 +36,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src0_src1_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_src0_src1_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -45,7 +45,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 off, [[SRC0]], off, off done compr{{$}} -define void @test_export_compr_en_invalid2_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_invalid2_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -54,7 +54,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 off, [[SRC0]], off, [[SRC1]] done compr{{$}} -define void @test_export_compr_en_invalid10_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_invalid10_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 10, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -63,7 +63,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0x38003800 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] compr{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done compr{{$}} -define void @test_export_compr_mrt7_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_mrt7_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 true, i1 false) ret void @@ -74,7 +74,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_z_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_z_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void @@ -85,7 +85,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}} -define void @test_export_compr_vm_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_vm_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 true) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 true) ret void @@ -94,7 +94,7 @@ ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2i16: ; GCN: exp mrt0 off, off, off, off
compr{{$}} ; GCN: exp mrt0 off, off, off, off done compr{{$}} -define void @test_export_compr_zeroes_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_zeroes_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 true, i1 false) ret void @@ -104,7 +104,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 [[SRC0]], off, off, off done compr{{$}} -define void @test_export_compr_en_src0_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_en_src0_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 1, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void } @@ -113,7 +113,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src1_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_en_src1_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 12, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void } @@ -122,7 +122,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src0_src1_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_en_src0_src1_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void } @@ -131,7 +131,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VI16:v[0-9]+]], 0x50005 ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] compr{{$}} ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] done compr{{$}} -define void @test_export_compr_mrt7_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_mrt7_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 true, i1 false) ret void @@ -142,7 +142,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_z_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_z_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void @@ -153,7 +153,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}} -define void @test_export_compr_vm_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_vm_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 true) call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 true) ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}test_export_zeroes_f32: ; GCN: exp mrt0 off, off, off,
off{{$}} ; GCN: exp mrt0 off, off, off, off done{{$}} -define void @test_export_zeroes_f32() #0 { +define amdgpu_kernel void @test_export_zeroes_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false) @@ -22,7 +22,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} -define void @test_export_en_src0_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -33,7 +33,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} -define void @test_export_en_src1_f32() #0 { +define amdgpu_kernel void @test_export_en_src1_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -44,7 +44,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} -define void @test_export_en_src2_f32() #0 { +define amdgpu_kernel void @test_export_en_src2_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -55,7 +55,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} -define void @test_export_en_src3_f32() #0 { +define amdgpu_kernel void @test_export_en_src3_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -66,7 +66,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} -define void @test_export_en_src0_src1_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -77,7 +77,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} -define void @test_export_en_src0_src2_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -89,7 +89,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} -define void @test_export_en_src0_src3_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -102,7 +102,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_en_src0_src1_src2_src3_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 { call 
void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -112,7 +112,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} -define void @test_export_mrt7_f32() #0 { +define amdgpu_kernel void @test_export_mrt7_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false) ret void @@ -125,7 +125,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_z_f32() #0 { +define amdgpu_kernel void @test_export_z_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -138,7 +138,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_null_f32() #0 { +define amdgpu_kernel void @test_export_null_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -151,7 +151,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved10_f32() #0 { +define amdgpu_kernel void @test_export_reserved10_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -164,7 +164,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved11_f32() #0 { +define amdgpu_kernel void @test_export_reserved11_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -177,7 +177,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos0_f32() #0 { +define amdgpu_kernel void @test_export_pos0_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -190,7 +190,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp 
pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos3_f32() #0 { +define amdgpu_kernel void @test_export_pos3_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -203,7 +203,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param0_f32() #0 { +define amdgpu_kernel void @test_export_param0_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -216,7 +216,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param31_f32() #0 { +define amdgpu_kernel void @test_export_param31_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -229,7 +229,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} -define void @test_export_vm_f32() #0 { +define amdgpu_kernel void @test_export_vm_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true) ret void @@ -252,7 +252,7 @@ ; GCN-LABEL: {{^}}test_export_zeroes_i32: ; GCN: exp mrt0 off, off, off, off{{$}} ; GCN: exp mrt0 off, off, off, off done{{$}} -define void @test_export_zeroes_i32() #0 { +define amdgpu_kernel void @test_export_zeroes_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 true, i1 false) @@ -267,7 +267,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} -define void @test_export_en_src0_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -278,7 +278,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} -define void @test_export_en_src1_i32() #0 { +define amdgpu_kernel void @test_export_en_src1_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -289,7 +289,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} -define void @test_export_en_src2_i32() #0 { +define amdgpu_kernel void @test_export_en_src2_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -300,7 +300,7 @@ ; GCN-DAG: v_mov_b32_e32 
[[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} -define void @test_export_en_src3_i32() #0 { +define amdgpu_kernel void @test_export_en_src3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -311,7 +311,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} -define void @test_export_en_src0_src1_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -322,7 +322,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} -define void @test_export_en_src0_src2_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -334,7 +334,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} -define void @test_export_en_src0_src3_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -347,7 +347,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_en_src0_src1_src2_src3_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -357,7 +357,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} -define void @test_export_mrt7_i32() #0 { +define amdgpu_kernel void @test_export_mrt7_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false) ret void @@ -370,7 +370,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_z_i32() #0 { +define amdgpu_kernel void @test_export_z_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -383,7 +383,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_null_i32() #0 { +define amdgpu_kernel void @test_export_null_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -396,7 
+396,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved10_i32() #0 { +define amdgpu_kernel void @test_export_reserved10_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -409,7 +409,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved11_i32() #0 { +define amdgpu_kernel void @test_export_reserved11_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -422,7 +422,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos0_i32() #0 { +define amdgpu_kernel void @test_export_pos0_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -435,7 +435,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos3_i32() #0 { +define amdgpu_kernel void @test_export_pos3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -448,7 +448,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param0_i32() #0 { +define amdgpu_kernel void @test_export_param0_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -461,7 +461,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param31_i32() #0 { +define amdgpu_kernel void @test_export_param31_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -474,7 +474,7 @@ ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} -define void @test_export_vm_i32() #0 { +define amdgpu_kernel void @test_export_vm_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true) call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 true) ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll 
=================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_dynamic_cc: ; GCN: s_endpgm -define void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) { +define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src0, float %src1, i32 %cc) store i64 %result, i64 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs: ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}} -define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { +define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) store i64 %result, i64 addrspace(1)* %out @@ -24,7 +24,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs: ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}| -define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { +define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) %result = call i64 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1) @@ -34,7 +34,7 @@ ; GCN-LABEL: {{^}}v_fcmp: ; GCN-NOT: v_cmp_eq_f32_e64 -define void @v_fcmp(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1) store i64 %result, i64 addrspace(1)* %out ret void @@ -42,7 +42,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_oeq: ; GCN: v_cmp_eq_f32_e64 -define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i64 %result, i64 addrspace(1)* %out ret void @@ -50,7 +50,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_one: ; GCN: v_cmp_neq_f32_e64 -define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i64 %result, i64 addrspace(1)* %out ret void @@ -58,7 +58,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_ogt: ; GCN: v_cmp_gt_f32_e64 -define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i64 %result, i64 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_oge: ; GCN: v_cmp_ge_f32_e64 -define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i64 %result, i64 addrspace(1)* %out ret void @@ -74,7 +74,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_olt: ; GCN: v_cmp_lt_f32_e64 -define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) { %result = 
call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i64 %result, i64 addrspace(1)* %out ret void @@ -82,7 +82,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_ole: ; GCN: v_cmp_le_f32_e64 -define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i64 %result, i64 addrspace(1)* %out ret void @@ -91,7 +91,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_ueq: ; GCN: v_cmp_nlg_f32_e64 -define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i64 %result, i64 addrspace(1)* %out ret void @@ -99,7 +99,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_une: ; GCN: v_cmp_neq_f32_e64 -define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i64 %result, i64 addrspace(1)* %out ret void @@ -107,7 +107,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_ugt: ; GCN: v_cmp_nle_f32_e64 -define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i64 %result, i64 addrspace(1)* %out ret void @@ -115,7 +115,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_uge: ; GCN: v_cmp_nlt_f32_e64 -define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i64 %result, i64 addrspace(1)* %out ret void @@ -123,7 +123,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_ult: ; GCN: v_cmp_nge_f32_e64 -define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i64 %result, i64 addrspace(1)* %out ret void @@ -131,7 +131,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f32_ule: ; GCN: v_cmp_ngt_f32_e64 -define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +139,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_oeq: ; GCN: v_cmp_eq_f64_e64 -define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1) store i64 %result, i64 addrspace(1)* %out ret void @@ -147,7 +147,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_one: ; GCN: v_cmp_neq_f64_e64 -define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6) store i64 %result, i64 addrspace(1)* %out ret void @@ -155,7 +155,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_ogt: ; GCN: v_cmp_gt_f64_e64 -define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { %result = call i64 
@llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2) store i64 %result, i64 addrspace(1)* %out ret void @@ -163,7 +163,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_oge: ; GCN: v_cmp_ge_f64_e64 -define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3) store i64 %result, i64 addrspace(1)* %out ret void @@ -171,7 +171,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_olt: ; GCN: v_cmp_lt_f64_e64 -define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4) store i64 %result, i64 addrspace(1)* %out ret void @@ -179,7 +179,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_ole: ; GCN: v_cmp_le_f64_e64 -define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5) store i64 %result, i64 addrspace(1)* %out ret void @@ -187,7 +187,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_ueq: ; GCN: v_cmp_nlg_f64_e64 -define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9) store i64 %result, i64 addrspace(1)* %out ret void @@ -195,7 +195,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_une: ; GCN: v_cmp_neq_f64_e64 -define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14) store i64 %result, i64 addrspace(1)* %out ret void @@ -203,7 +203,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_ugt: ; GCN: v_cmp_nle_f64_e64 -define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10) store i64 %result, i64 addrspace(1)* %out ret void @@ -211,7 +211,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_uge: ; GCN: v_cmp_nlt_f64_e64 -define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11) store i64 %result, i64 addrspace(1)* %out ret void @@ -219,7 +219,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_ult: ; GCN: v_cmp_nge_f64_e64 -define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12) store i64 %result, i64 addrspace(1)* %out ret void @@ -227,7 +227,7 @@ ; GCN-LABEL: {{^}}v_fcmp_f64_ule: ; GCN: v_cmp_ngt_f64_e64 -define void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13) store i64 %result, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll @@ -8,7 +8,7 @@ ; 
CHECK: v_rcp_f32_e32 ; CHECK: v_mul_f32_e32 ; CHECK: v_mul_f32_e32 -define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { +define amdgpu_kernel void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) store float %fdiv, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}test_fmed3_f16: ; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { +define amdgpu_kernel void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { %src0.f16 = trunc i32 %src0.arg to i16 %src0 = bitcast i16 %src0.f16 to half %src1.f16 = trunc i32 %src1.arg to i16 @@ -16,7 +16,7 @@ ; GCN-LABEL: {{^}}test_fmed3_srcmods_f16: ; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}| -define void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { +define amdgpu_kernel void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { %src0.f16 = trunc i32 %src0.arg to i16 %src0 = bitcast i16 %src0.f16 to half %src1.f16 = trunc i32 %src1.arg to i16 Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}test_fmed3: ; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { +define amdgpu_kernel void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { %mad = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2) store float %mad, float addrspace(1)* %out ret void @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}test_fmed3_srcmods: ; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}| -define void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { +define amdgpu_kernel void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { %src0.fneg = fsub float -0.0, %src0 %src1.fabs = call float @llvm.fabs.f32(float %src1) %src2.fabs = call float @llvm.fabs.f32(float %src2) Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_f32: ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) store float %result, float addrspace(1)* %out, align 4 ret void @@ -12,7 +12,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32: ; GCN: v_mul_legacy_f32_e32 -define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void 
@test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a) store float %result, float addrspace(1)* %out, align 4 ret void @@ -20,7 +20,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32: ; GCN: v_mul_legacy_f32_e32 -define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef) store float %result, float addrspace(1)* %out, align 4 ret void @@ -28,7 +28,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32: ; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}| -define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) %b.fabs = call float @llvm.fabs.f32(float %b) %result = call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs) @@ -40,7 +40,7 @@ ; GCN-LABEL: {{^}}test_mad_legacy_f32: ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_add_f32_e32 -define void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 { +define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 { %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) %add = fadd float %mul, %c store float %add, float addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll @@ -7,7 +7,7 @@ ; VI: v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fract_f16( +define amdgpu_kernel void @fract_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}v_fract_f32: ; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { %fract = call float @llvm.amdgcn.fract.f32(float %src) store float %fract, float addrspace(1)* %out ret void @@ -14,7 +14,7 @@ ; GCN-LABEL: {{^}}v_fract_f64: ; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { %fract = call double @llvm.amdgcn.fract.f64(double %src) store double %fract, double addrspace(1)* %out ret void @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}v_fract_undef_f32: ; GCN-NOT: v_fract_f32 ; GCN-NOT: store_dword -define void @v_fract_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_fract_undef_f32(float addrspace(1)* %out) #1 { %fract = call float @llvm.amdgcn.fract.f32(float undef) store float %fract, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll +++ 
test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll @@ -6,7 +6,7 @@ ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_I16]] -define void @frexp_exp_f16( +define amdgpu_kernel void @frexp_exp_f16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -21,7 +21,7 @@ ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; VI: v_bfe_i32 v[[R_I32:[0-9]+]], v[[R_I16]], 0, 16{{$}} ; GCN: buffer_store_dword v[[R_I32]] -define void @frexp_exp_f16_sext( +define amdgpu_kernel void @frexp_exp_f16_sext( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; VI: v_and_b32_e32 v[[R_I32:[0-9]+]], 0xffff, v[[R_I16]] ; GCN: buffer_store_dword v[[R_I32]] -define void @frexp_exp_f16_zext( +define amdgpu_kernel void @frexp_exp_f16_zext( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}s_test_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src) store i32 %frexp.exp, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}| -define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fabs.src) store i32 %frexp.exp, i32 addrspace(1)* %out @@ -25,7 +25,7 @@ ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}| -define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %fneg.fabs.src = fsub float -0.0, %fabs.src %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fneg.fabs.src) @@ -35,7 +35,7 @@ ; GCN-LABEL: {{^}}s_test_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %src) store i32 %frexp.exp, i32 addrspace(1)* %out ret void @@ -43,7 +43,7 @@ ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fabs.src) store i32 %frexp.exp, i32 addrspace(1)* %out @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}| -define void 
@s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %fneg.fabs.src = fsub double -0.0, %fabs.src %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fneg.fabs.src) Index: test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll @@ -7,7 +7,7 @@ ; VI: v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @frexp_mant_f16( +define amdgpu_kernel void @frexp_mant_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}s_test_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src) store float %frexp.mant, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}| -define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src) store float %frexp.mant, float addrspace(1)* %out @@ -25,7 +25,7 @@ ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}| -define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %fneg.fabs.src = fsub float -0.0, %fabs.src %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src) @@ -35,7 +35,7 @@ ; GCN-LABEL: {{^}}s_test_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src) store double %frexp.mant, double addrspace(1)* %out ret void @@ -43,7 +43,7 @@ ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src) store double %frexp.mant, double addrspace(1)* %out @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}| -define void 
@s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %fneg.fabs.src = fsub double -0.0, %fabs.src %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src) Index: test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: {{^}}groupstaticsize_test0: ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} -define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { +define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 @@ -23,7 +23,7 @@ ; CHECK-LABEL: {{^}}groupstaticsize_test1: ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} -define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { +define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { entry: %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 @@ -51,7 +51,7 @@ ; Exceeds 16-bit simm limit of s_movk_i32 ; CHECK-LABEL: {{^}}large_groupstaticsize: ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} -define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { +define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx store volatile i32 0, i32 addrspace(3)* %gep %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() Index: test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -7,7 +7,7 @@ ; No crash on invalid input ; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc: ; GCN: s_endpgm -define void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) { +define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc) store i64 %result, i64 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ ; GCN-LABEL: {{^}}v_icmp_i32_eq: ; GCN: v_cmp_eq_u32_e64 -define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i64 %result, i64 addrspace(1)* %out ret void @@ -23,14 +23,14 @@ ; GCN-LABEL: {{^}}v_icmp: ; GCN-NOT: v_cmp_eq_u32_e64 -define void @v_icmp(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i32_ne: ; GCN: v_cmp_ne_u32_e64 -define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 
100, i32 33) store i64 %result, i64 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}v_icmp_u32_ugt: ; GCN: v_cmp_gt_u32_e64 -define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i64 %result, i64 addrspace(1)* %out ret void @@ -46,7 +46,7 @@ ; GCN-LABEL: {{^}}v_icmp_u32_uge: ; GCN: v_cmp_ge_u32_e64 -define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i64 %result, i64 addrspace(1)* %out ret void @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}v_icmp_u32_ult: ; GCN: v_cmp_lt_u32_e64 -define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i64 %result, i64 addrspace(1)* %out ret void @@ -62,7 +62,7 @@ ; GCN-LABEL: {{^}}v_icmp_u32_ule: ; GCN: v_cmp_le_u32_e64 -define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i64 %result, i64 addrspace(1)* %out ret void @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}v_icmp_i32_sgt: ; GCN: v_cmp_gt_i32_e64 -define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { +define amdgpu_kernel void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i64 %result, i64 addrspace(1)* %out ret void @@ -78,7 +78,7 @@ ; GCN-LABEL: {{^}}v_icmp_i32_sge: ; GCN: v_cmp_ge_i32_e64 -define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i64 %result, i64 addrspace(1)* %out ret void @@ -86,14 +86,14 @@ ; GCN-LABEL: {{^}}v_icmp_i32_slt: ; GCN: v_cmp_lt_i32_e64 -define void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i32_sle: ; GCN: v_cmp_le_i32_e64 -define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41) store i64 %result, i64 addrspace(1)* %out ret void @@ -101,7 +101,7 @@ ; GCN-LABEL: {{^}}v_icmp_i64_eq: ; GCN: v_cmp_eq_u64_e64 -define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32) store i64 %result, i64 addrspace(1)* %out ret void @@ -109,7 +109,7 @@ ; GCN-LABEL: {{^}}v_icmp_i64_ne: ; GCN: v_cmp_ne_u64_e64 -define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33) store i64 %result, i64 addrspace(1)* %out ret void @@ -117,7 +117,7 @@ ; GCN-LABEL: {{^}}v_icmp_u64_ugt: ; GCN: v_cmp_gt_u64_e64 -define void @v_icmp_u64_ugt(i64 
addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34) store i64 %result, i64 addrspace(1)* %out ret void @@ -125,7 +125,7 @@ ; GCN-LABEL: {{^}}v_icmp_u64_uge: ; GCN: v_cmp_ge_u64_e64 -define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35) store i64 %result, i64 addrspace(1)* %out ret void @@ -133,7 +133,7 @@ ; GCN-LABEL: {{^}}v_icmp_u64_ult: ; GCN: v_cmp_lt_u64_e64 -define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36) store i64 %result, i64 addrspace(1)* %out ret void @@ -141,7 +141,7 @@ ; GCN-LABEL: {{^}}v_icmp_u64_ule: ; GCN: v_cmp_le_u64_e64 -define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37) store i64 %result, i64 addrspace(1)* %out ret void @@ -149,7 +149,7 @@ ; GCN-LABEL: {{^}}v_icmp_i64_sgt: ; GCN: v_cmp_gt_i64_e64 -define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38) store i64 %result, i64 addrspace(1)* %out ret void @@ -157,7 +157,7 @@ ; GCN-LABEL: {{^}}v_icmp_i64_sge: ; GCN: v_cmp_ge_i64_e64 -define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39) store i64 %result, i64 addrspace(1)* %out ret void @@ -165,14 +165,14 @@ ; GCN-LABEL: {{^}}v_icmp_i64_slt: ; GCN: v_cmp_lt_i64_e64 -define void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i64_sle: ; GCN: v_cmp_le_i64_e64 -define void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41) store i64 %result, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}gather4_v2: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ ; GCN-LABEL: {{^}}gather4: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void 
@gather4(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}gather4_cl: ; GCN: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}gather4_l: ; GCN: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ ; GCN-LABEL: {{^}}gather4_b: ; GCN: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ ; GCN-LABEL: {{^}}gather4_b_cl: ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ ; GCN-LABEL: {{^}}gather4_b_cl_v8: ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ ; GCN-LABEL: {{^}}gather4_lz_v2: ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x 
float> addrspace(1)* %out @@ -75,7 +75,7 @@ ; GCN-LABEL: {{^}}gather4_lz: ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -86,7 +86,7 @@ ; GCN-LABEL: {{^}}gather4_o: ; GCN: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -95,7 +95,7 @@ ; GCN-LABEL: {{^}}gather4_cl_o: ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -104,7 +104,7 @@ ; GCN-LABEL: {{^}}gather4_cl_o_v8: ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -113,7 +113,7 @@ ; GCN-LABEL: {{^}}gather4_l_o: ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -122,7 +122,7 @@ ; GCN-LABEL: {{^}}gather4_l_o_v8: ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -131,7 +131,7 @@ ; GCN-LABEL: {{^}}gather4_b_o: ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o(<4 x float> addrspace(1)* %out) { +define 
amdgpu_kernel void @gather4_b_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -140,7 +140,7 @@ ; GCN-LABEL: {{^}}gather4_b_o_v8: ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -149,7 +149,7 @@ ; GCN-LABEL: {{^}}gather4_b_cl_o: ; GCN: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -158,7 +158,7 @@ ; GCN-LABEL: {{^}}gather4_lz_o: ; GCN: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -168,7 +168,7 @@ ; GCN-LABEL: {{^}}gather4_c: ; GCN: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -177,7 +177,7 @@ ; GCN-LABEL: {{^}}gather4_c_cl: ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -186,7 +186,7 @@ ; GCN-LABEL: {{^}}gather4_c_cl_v8: ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> 
%r, <4 x float> addrspace(1)* %out @@ -195,7 +195,7 @@ ; GCN-LABEL: {{^}}gather4_c_l: ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -204,7 +204,7 @@ ; GCN-LABEL: {{^}}gather4_c_l_v8: ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -213,7 +213,7 @@ ; GCN-LABEL: {{^}}gather4_c_b: ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -222,7 +222,7 @@ ; GCN-LABEL: {{^}}gather4_c_b_v8: ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -231,7 +231,7 @@ ; GCN-LABEL: {{^}}gather4_c_b_cl: ; GCN: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -240,7 +240,7 @@ ; GCN-LABEL: {{^}}gather4_c_lz: ; GCN: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -250,7 +250,7 @@ ; GCN-LABEL: {{^}}gather4_c_o: ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o(<4 x float> 
addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -259,7 +259,7 @@ ; GCN-LABEL: {{^}}gather4_c_o_v8: ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -268,7 +268,7 @@ ; GCN-LABEL: {{^}}gather4_c_cl_o: ; GCN: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -277,7 +277,7 @@ ; GCN-LABEL: {{^}}gather4_c_l_o: ; GCN: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -286,7 +286,7 @@ ; GCN-LABEL: {{^}}gather4_c_b_o: ; GCN: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -295,7 +295,7 @@ ; GCN-LABEL: {{^}}gather4_c_b_cl_o: ; GCN: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -304,7 +304,7 @@ ; GCN-LABEL: {{^}}gather4_c_lz_o: ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> 
undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -313,7 +313,7 @@ ; GCN-LABEL: {{^}}gather4_c_lz_o_v8: ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -322,7 +322,7 @@ ; GCN-LABEL: {{^}}gather4_f32: ; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @gather4_f32(float addrspace(1)* %out) { main_body: %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store float %r, float addrspace(1)* %out @@ -331,7 +331,7 @@ ; GCN-LABEL: {{^}}gather4_v2f32: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define void @gather4_v2f32(<2 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_v2f32(<2 x float> addrspace(1)* %out) { main_body: %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1) store <2 x float> %r, <2 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}getlod: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ ; GCN-LABEL: {{^}}getlod_v2: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}getlod_v4: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod_v4(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -31,7 +31,7 @@ ; GCN-LABEL: 
{{^}}adjust_writemask_getlod_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}sample: ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ ; GCN-LABEL: {{^}}sample_cl: ; GCN: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}sample_d: ; GCN: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}sample_d_cl: ; GCN: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ ; GCN-LABEL: {{^}}sample_l: ; GCN: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ ; GCN-LABEL: {{^}}sample_b: ; GCN: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b(<4 x float> 
addrspace(1)* %out) { +define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ ; GCN-LABEL: {{^}}sample_b_cl: ; GCN: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ ; GCN-LABEL: {{^}}sample_lz: ; GCN: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ ; GCN-LABEL: {{^}}sample_cd: ; GCN: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -84,7 +84,7 @@ ; GCN-LABEL: {{^}}sample_cd_cl: ; GCN: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -93,7 +93,7 @@ ; GCN-LABEL: {{^}}sample_c: ; GCN: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -102,7 +102,7 @@ ; GCN-LABEL: {{^}}sample_c_cl: ; GCN: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -111,7 +111,7 @@ ; GCN-LABEL: 
{{^}}sample_c_d: ; GCN: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ ; GCN-LABEL: {{^}}sample_c_d_cl: ; GCN: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -129,7 +129,7 @@ ; GCN-LABEL: {{^}}sample_c_l: ; GCN: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -138,7 +138,7 @@ ; GCN-LABEL: {{^}}sample_c_b: ; GCN: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -147,7 +147,7 @@ ; GCN-LABEL: {{^}}sample_c_b_cl: ; GCN: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -156,7 +156,7 @@ ; GCN-LABEL: {{^}}sample_c_lz: ; GCN: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -165,7 +165,7 @@ ; GCN-LABEL: {{^}}sample_c_cd: ; GCN: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call 
<4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -174,7 +174,7 @@ ; GCN-LABEL: {{^}}sample_c_cd_cl: ; GCN: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -183,7 +183,7 @@ ; GCN-LABEL: {{^}}sample_f32: ; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 -define void @sample_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @sample_f32(float addrspace(1)* %out) { main_body: %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0) store float %r, float addrspace(1)* %out @@ -192,7 +192,7 @@ ; GCN-LABEL: {{^}}sample_v2f32: ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 -define void @sample_v2f32(<2 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_v2f32(<2 x float> addrspace(1)* %out) { main_body: %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0) store <2 x float> %r, <2 x float> addrspace(1)* %out @@ -201,7 +201,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_0: ; GCN: image_sample v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1{{$}} -define void @adjust_writemask_sample_0(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_0(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -211,7 +211,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_01: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x3{{$}} -define void @adjust_writemask_sample_01(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_01(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -223,7 +223,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_012: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x7{{$}} -define void @adjust_writemask_sample_012(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_012(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -237,7 +237,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_12: ; GCN: image_sample 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x6{{$}} -define void @adjust_writemask_sample_12(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_12(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt1 = extractelement <4 x float> %r, i32 1 @@ -249,7 +249,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_03: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x9{{$}} -define void @adjust_writemask_sample_03(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_03(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -261,7 +261,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_13: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xa{{$}} -define void @adjust_writemask_sample_13(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_13(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt1 = extractelement <4 x float> %r, i32 1 @@ -273,7 +273,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_123: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xe{{$}} -define void @adjust_writemask_sample_123(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_123(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt1 = extractelement <4 x float> %r, i32 1 @@ -288,7 +288,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_variable_dmask_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) { +define amdgpu_kernel void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 %dmask, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -300,7 +300,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -311,7 +311,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = 
call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -322,7 +322,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_d_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -333,7 +333,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -344,7 +344,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_l_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -355,7 +355,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_b_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -366,7 +366,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -377,7 +377,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_lz_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -388,7 +388,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_none_enabled(float 
addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -399,7 +399,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}sample: ; GCN: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ ; GCN-LABEL: {{^}}sample_cl: ; GCN: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}sample_d: ; GCN: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}sample_d_cl: ; GCN: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ ; GCN-LABEL: {{^}}sample_l: ; GCN: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) { 
main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ ; GCN-LABEL: {{^}}sample_b: ; GCN: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ ; GCN-LABEL: {{^}}sample_b_cl: ; GCN: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ ; GCN-LABEL: {{^}}sample_lz: ; GCN: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ ; GCN-LABEL: {{^}}sample_cd: ; GCN: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -84,7 +84,7 @@ ; GCN-LABEL: {{^}}sample_cd_cl: ; GCN: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -93,7 +93,7 @@ ; GCN-LABEL: {{^}}sample_c: ; GCN: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -102,7 +102,7 @@ ; GCN-LABEL: {{^}}sample_c_cl: ; GCN: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -111,7 +111,7 @@ ; GCN-LABEL: {{^}}sample_c_d: ; GCN: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ ; GCN-LABEL: {{^}}sample_c_d_cl: ; GCN: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -129,7 +129,7 @@ ; GCN-LABEL: {{^}}sample_c_l: ; GCN: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -138,7 +138,7 @@ ; GCN-LABEL: {{^}}sample_c_b: ; GCN: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -147,7 +147,7 @@ ; GCN-LABEL: {{^}}sample_c_b_cl: ; GCN: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -156,7 +156,7 @@ ; GCN-LABEL: {{^}}sample_c_lz: ; GCN: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> 
@llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -165,7 +165,7 @@ ; GCN-LABEL: {{^}}sample_c_cd: ; GCN: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -174,7 +174,7 @@ ; GCN-LABEL: {{^}}sample_c_cd_cl: ; GCN: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -184,7 +184,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -195,7 +195,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -206,7 +206,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_d_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -217,7 +217,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -228,7 +228,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_l_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void 
@adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -239,7 +239,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_b_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -250,7 +250,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -261,7 +261,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_lz_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -272,7 +272,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -283,7 +283,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -294,7 +294,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, 
i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -305,7 +305,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -316,7 +316,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -327,7 +327,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -338,7 +338,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_l_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -349,7 +349,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -360,7 +360,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -371,7 +371,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_lz_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void 
@adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -382,7 +382,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -393,7 +393,7 @@ ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -8,7 +8,7 @@ ; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa -define void @test(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 { %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 @@ -20,7 +20,7 @@ ; ALL-LABEL: {{^}}test_implicit: ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 -define void @test_implicit(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 @@ -39,7 +39,7 @@ ; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]] ; MESA: buffer_store_dword [[V_VAL]] ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]] -define void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { +define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %arg.ptr Index: test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -7,7 +7,7 @@ ; GCN: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]] ; GCN: buffer_store_short v[[R_F16]] -define 
void @ldexp_f16( +define amdgpu_kernel void @ldexp_f16( half addrspace(1)* %r, half addrspace(1)* %a, i32 addrspace(1)* %b) { @@ -22,7 +22,7 @@ ; GCN: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]] ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16_imm_a( +define amdgpu_kernel void @ldexp_f16_imm_a( half addrspace(1)* %r, i32 addrspace(1)* %b) { %b.val = load i32, i32 addrspace(1)* %b @@ -35,7 +35,7 @@ ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}} ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16_imm_b( +define amdgpu_kernel void @ldexp_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { %a.val = load half, half addrspace(1)* %a Index: test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll @@ -7,7 +7,7 @@ ; SI-LABEL: {{^}}test_ldexp_f32: ; SI: v_ldexp_f32 ; SI: s_endpgm -define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ ; SI-LABEL: {{^}}test_ldexp_f64: ; SI: v_ldexp_f64 ; SI: s_endpgm -define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void @@ -24,7 +24,7 @@ ; SI-LABEL: {{^}}test_ldexp_undef_f32: ; SI-NOT: v_ldexp_f32 -define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_lerp: ; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind { +define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind { %result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}v_log_clamp_f32: ; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 { %log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0 store float %log.clamp, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll =================================================================== --- 
test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -7,7 +7,7 @@ ; VI: v_mov_b32_e32 v0, s{{[0-9]+}} ; VI: s_nop 1 ; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -define void @dpp_test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 store i32 %tmp0, i32 addrspace(1)* %out ret void @@ -19,7 +19,7 @@ ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 -define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0 store i32 %tmp1, i32 addrspace(1)* %out @@ -36,7 +36,7 @@ ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 -define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) { +define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) { %cmp = fcmp oeq float %cond, 0.0 br i1 %cmp, label %if, label %else Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate: ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { +define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { %result = call <4 x i32> 
@llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> ) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { +define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { +define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { +define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -37,7 +37,7 @@ ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) { +define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) { %in = load <4 x i32>, <4 x i32> addrspace(1) * %input %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_msad_u8: ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}v_msad_u8_non_immediate: ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll =================================================================== --- 
test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate: ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { +define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_queue_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* %value = load i32, i32 addrspace(2)* %header_ptr Index: test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll @@ -7,7 +7,7 @@ ; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rcp_f16( +define amdgpu_kernel void @rcp_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}rcp_legacy_f32: ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float %src) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ ; TODO: Really these should be constant folded ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_4.0 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 4.0 -define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float 4.0) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -24,7 +24,7 @@ ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_100.0 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float 100.0) #0 store float %rcp, float addrspace(1)* %out, 
align 4 ret void @@ -32,7 +32,7 @@ ; GCN-LABEL: {{^}}rcp_legacy_undef_f32: ; GCN-NOT: v_rcp_legacy_f32 -define void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float undef) store float %rcp, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -8,7 +8,7 @@ ; FUNC-LABEL: {{^}}rcp_undef_f32: ; SI-NOT: v_rcp_f32 -define void @rcp_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_undef_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.f32(float undef) store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -17,7 +17,7 @@ ; FUNC-LABEL: {{^}}rcp_2_f32: ; SI-NOT: v_rcp_f32 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0.5 -define void @rcp_2_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_2_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.f32(float 2.0) store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ ; FUNC-LABEL: {{^}}rcp_10_f32: ; SI-NOT: v_rcp_f32 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x3dcccccd -define void @rcp_10_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.f32(float 10.0) store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -36,7 +36,7 @@ ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] -define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -46,7 +46,7 @@ ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] -define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { +define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -54,7 +54,7 @@ ; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32: ; SI: v_div_scale_f32 -define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 { +define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -63,7 +63,7 @@ ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32: ; SI: v_sqrt_f32_e32 ; SI: v_rcp_f32_e32 -define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) store float %rcp, float addrspace(1)* %out, align 4 @@ -72,7 +72,7 @@ ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32: ; SI: v_rsq_f32_e32 -define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) store float %rcp, 
float addrspace(1)* %out, align 4 @@ -83,7 +83,7 @@ ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @rcp_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double %src) #1 { %rcp = call double @llvm.amdgcn.rcp.f64(double %src) store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -93,7 +93,7 @@ ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { %rcp = call double @llvm.amdgcn.rcp.f64(double %src) store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -101,7 +101,7 @@ ; FUNC-LABEL: {{^}}rcp_pat_f64: ; SI: v_div_scale_f64 -define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { %rcp = fdiv double 1.0, %src store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -111,7 +111,7 @@ ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { %rcp = fdiv double 1.0, %src store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -121,7 +121,7 @@ ; SI-NOT: v_rsq_f64_e32 ; SI: v_sqrt_f64 ; SI: v_rcp_f64 -define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { %sqrt = call double @llvm.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) store double %rcp, double addrspace(1)* %out, align 8 @@ -132,7 +132,7 @@ ; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { %sqrt = call double @llvm.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) store double %rcp, double addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}test_readfirstlane: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}} -define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { +define amdgpu_kernel void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; CHECK-LABEL: {{^}}test_readfirstlane_imm: ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] -define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) store i32 %readfirstlane, i32 addrspace(1)* %out, 
align 4 ret void @@ -25,7 +25,7 @@ ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] -define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}test_readlane_sreg: ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; CHECK-LABEL: {{^}}test_readlane_imm_sreg: ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} -define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} -define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 @@ -34,7 +34,7 @@ ; CHECK-LABEL: {{^}}test_readlane_imm: ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32 -define void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 { +define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0 store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -12,7 +12,7 @@ ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] ; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]] ; VI: buffer_store_dword [[RESULT]] -define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src) store float %rsq_clamp, float addrspace(1)* %out ret void @@ -30,7 +30,7 @@ ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] ; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]] -define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { +define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { %rsq_clamp = call 
double @llvm.amdgcn.rsq.clamp.f64(double %src) store double %rsq_clamp, double addrspace(1)* %out ret void @@ -38,7 +38,7 @@ ; FUNC-LABEL: {{^}}rsq_clamp_undef_f32: ; SI-NOT: v_rsq_clamp_f32 -define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef) store float %rsq_clamp, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll @@ -7,7 +7,7 @@ ; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rsq_f16( +define amdgpu_kernel void @rsq_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}rsq_legacy_f32: ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0 -define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ ; FUNC-LABEL: {{^}}rsq_legacy_undef_f32: ; SI-NOT: v_rsq_legacy_f32 -define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float undef) store float %rsq, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}rsq_f32: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rsq_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -15,7 +15,7 @@ ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { %rsq = call float 
@llvm.amdgcn.rsq.f32(float 4.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -23,7 +23,7 @@ ; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -31,7 +31,7 @@ ; FUNC-LABEL: {{^}}rsq_f64: ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @rsq_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rsq_f64(double addrspace(1)* %out, double %src) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -40,7 +40,7 @@ ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_f64_constant_4.0 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0 -define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -50,7 +50,7 @@ ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000 ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}} ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -58,7 +58,7 @@ ; FUNC-LABEL: {{^}}rsq_undef_f32: ; SI-NOT: v_rsq_f32 -define void @rsq_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_undef_f32(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float undef) store float %rsq, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -7,7 +7,7 @@ ; GFX9: flat_store_dword ; GFX9-NOT: s_waitcnt ; GCN: s_barrier -define void @test_barrier(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll @@ -9,7 +9,7 @@ ; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7] ; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00] ; GCN-NEXT: s_endpgm -define void @test_s_dcache_inv() #0 { +define amdgpu_kernel void @test_s_dcache_inv() #0 { call void @llvm.amdgcn.s.dcache.inv() ret void } @@ -18,7 +18,7 @@ ; GCN-NEXT: ; BB#0: ; GCN: s_dcache_inv ; GCN: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_inv_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll =================================================================== --- 
test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll @@ -9,7 +9,7 @@ ; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7] ; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00] ; GCN-NEXT: s_endpgm -define void @test_s_dcache_inv_vol() #0 { +define amdgpu_kernel void @test_s_dcache_inv_vol() #0 { call void @llvm.amdgcn.s.dcache.inv.vol() ret void } @@ -18,7 +18,7 @@ ; GCN-NEXT: ; BB#0: ; GCN-NEXT: s_dcache_inv_vol ; GCN: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_inv_vol_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv.vol() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll @@ -7,7 +7,7 @@ ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_endpgm -define void @test_s_dcache_wb() #0 { +define amdgpu_kernel void @test_s_dcache_wb() #0 { call void @llvm.amdgcn.s.dcache.wb() ret void } @@ -16,7 +16,7 @@ ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb ; VI: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_wb_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll @@ -7,7 +7,7 @@ ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_endpgm -define void @test_s_dcache_wb_vol() #0 { +define amdgpu_kernel void @test_s_dcache_wb_vol() #0 { call void @llvm.amdgcn.s.dcache.wb.vol() ret void } @@ -16,7 +16,7 @@ ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb_vol ; VI: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_wb_vol_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb.vol() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll @@ -20,7 +20,7 @@ ; GCN: s_decperflevel 13{{$}} ; GCN: s_decperflevel 14{{$}} ; GCN: s_decperflevel 15{{$}} -define void @test_s_decperflevel(i32 %x) #0 { +define amdgpu_kernel void @test_s_decperflevel(i32 %x) #0 { call void @llvm.amdgcn.s.decperflevel(i32 0) call void @llvm.amdgcn.s.decperflevel(i32 1) call void @llvm.amdgcn.s.decperflevel(i32 2) Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}s_getreg_test: ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23) -define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. +define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. 
%lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) %lds_size_bytes = shl i32 %lds_size_64dwords, 8 store i32 %lds_size_bytes, i32 addrspace(1)* %out @@ -14,7 +14,7 @@ ; Call site has additional readnone knowledge. ; GCN-LABEL: {{^}}readnone_s_getreg_test: ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23) -define void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. +define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #1 %lds_size_bytes = shl i32 %lds_size_64dwords, 8 store i32 %lds_size_bytes, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll @@ -20,7 +20,7 @@ ; GCN: s_incperflevel 13{{$}} ; GCN: s_incperflevel 14{{$}} ; GCN: s_incperflevel 15{{$}} -define void @test_s_incperflevel(i32 %x) #0 { +define amdgpu_kernel void @test_s_incperflevel(i32 %x) #0 { call void @llvm.amdgcn.s.incperflevel(i32 0) call void @llvm.amdgcn.s.incperflevel(i32 1) call void @llvm.amdgcn.s.incperflevel(i32 2) Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll @@ -10,7 +10,7 @@ ; GCN-NOT: lgkmcnt ; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: _store_dwordx2 -define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_s_memrealtime(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.amdgcn.s.memrealtime() store volatile i64 %cycle0, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll @@ -11,7 +11,7 @@ ; GCN-NOT: lgkmcnt ; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 -define void @test_s_memtime(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.amdgcn.s.memtime() store volatile i64 %cycle0, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll @@ -20,7 +20,7 @@ ; GCN: s_sleep 13{{$}} ; GCN: s_sleep 14{{$}} ; GCN: s_sleep 15{{$}} -define void @test_s_sleep(i32 %x) #0 { +define amdgpu_kernel void @test_s_sleep(i32 %x) #0 { call void @llvm.amdgcn.s.sleep(i32 0) call void @llvm.amdgcn.s.sleep(i32 1) call void @llvm.amdgcn.s.sleep(i32 2) Index: test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_sad_hi_u8: ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: 
{{^}}v_sad_hi_u8_non_immediate: ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_sad_u16: ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}v_sad_u16_non_immediate: ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_sad_u8: ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}v_sad_u8_non_immediate: ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg: ; GCN: v_bfe_i32 -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm: ; GCN: v_bfe_i32 -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ ; GCN-LABEL: 
{{^}}bfe_i32_arg_imm_arg: ; GCN: v_bfe_i32 -define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { +define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ ; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg: ; GCN: v_bfe_i32 -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ ; GCN-LABEL: {{^}}v_bfe_print_arg: ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { +define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { %load = load i32, i32 addrspace(1)* %src0, align 4 %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -64,7 +64,7 @@ ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: s_endpgm -define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31) @@ -78,7 +78,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31) @@ -90,7 +90,7 @@ ; GCN: buffer_load_dword ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; GCN: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -103,7 +103,7 @@ ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 
addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -115,7 +115,7 @@ ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -127,7 +127,7 @@ ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -139,7 +139,7 @@ ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -150,7 +150,7 @@ ; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -161,7 +161,7 @@ ; GCN-NOT: lshr ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -173,7 +173,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -195,7 +195,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1) store 
i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -206,7 +206,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -217,7 +217,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -228,7 +228,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -250,7 +250,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -261,7 +261,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -272,7 +272,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -283,7 +283,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -294,7 +294,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 
@llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -305,7 +305,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -316,7 +316,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -327,7 +327,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -338,7 +338,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -349,7 +349,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -360,7 +360,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -371,7 +371,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -383,7 +383,7 @@ ; GCN-NOT: v_ashr ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 ; GCN: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24) %shl = shl i32 %bfe, 8 @@ -399,7 +399,7 @@ ; GCN: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] ; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; GCN: buffer_store_dword [[TMP2]] -define void 
@simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) %div = sdiv i32 %bfe, 2 Index: test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll @@ -5,7 +5,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_INTERRUPT) -define void @test_interrupt() { +define amdgpu_kernel void @test_interrupt() { body: call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0); ret void @@ -15,7 +15,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) -define void @test_gs_emit() { +define amdgpu_kernel void @test_gs_emit() { body: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0); ret void @@ -25,7 +25,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) -define void @test_gs_cut() { +define amdgpu_kernel void @test_gs_cut() { body: call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0); ret void @@ -35,7 +35,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -define void @test_gs_emit_cut() { +define amdgpu_kernel void @test_gs_emit_cut() { body: call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0) ret void @@ -45,7 +45,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) -define void @test_gs_done() { +define amdgpu_kernel void @test_gs_done() { body: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void @@ -66,7 +66,7 @@ ; VI-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT) ; GCN-NEXT: s_endpgm -define void @sendmsghalt(i32 inreg %a) #0 { +define amdgpu_kernel void @sendmsghalt(i32 inreg %a) #0 { call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a) ret void } @@ -75,7 +75,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_INTERRUPT) -define void @test_interrupt_halt() { +define amdgpu_kernel void @test_interrupt_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) ret void @@ -85,7 +85,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0) -define void @test_gs_emit_halt() { +define amdgpu_kernel void @test_gs_emit_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0) ret void @@ -95,7 +95,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1) -define void @test_gs_cut_halt() { +define amdgpu_kernel void @test_gs_cut_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0) ret void @@ -105,7 +105,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -define void @test_gs_emit_cut_halt() { +define amdgpu_kernel void @test_gs_emit_cut_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0) ret void @@ -115,7 +115,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP) -define void @test_gs_done_halt() { +define amdgpu_kernel void @test_gs_done_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0) ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll =================================================================== --- 
test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll @@ -8,7 +8,7 @@ ; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], -define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { +define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val) store i32 %r, i32 addrspace(1)* %out, align 4 ret void @@ -18,7 +18,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], -define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { %val = load i32, i32 addrspace(1)* %valptr, align 4 %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val) store i32 %r, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll @@ -7,7 +7,7 @@ ; VI: v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sin_f16( +define amdgpu_kernel void @sin_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_sin_f32: ; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_sin_f32(float addrspace(1)* %out, float %src) #1 { %sin = call float @llvm.amdgcn.sin.f32(float %src) #0 store float %sin, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll @@ -9,7 +9,7 @@ ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load double, double addrspace(1)* %aptr, align 8 %b = load i32, i32 addrspace(1)* %bptr, align 4 %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone @@ -22,7 +22,7 @@ ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { +define amdgpu_kernel void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { %a = load double, double addrspace(1)* %aptr, align 8 %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone store double %result, double addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ 
test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg: ; GCN: v_bfe_u32 -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}bfe_u32_arg_arg_imm: ; GCN: v_bfe_u32 -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ ; GCN-LABEL: {{^}}bfe_u32_arg_imm_arg: ; GCN: v_bfe_u32 -define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { +define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ ; GCN-LABEL: {{^}}bfe_u32_imm_arg_arg: ; GCN: v_bfe_u32 -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -36,7 +36,7 @@ ; GCN-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -55,7 +55,7 @@ ; GCN: buffer_load_ubyte ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %load = load i8, i8 addrspace(1)* %in %ext = zext i8 %load to i32 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8) @@ -70,7 +70,7 @@ ; FIXME: Should be using s_add_i32 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -85,7 +85,7 @@ ; GCN-NEXT: v_and_b32_e32 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 
addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -99,7 +99,7 @@ ; GCN: v_add_i32 ; GCN: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -114,7 +114,7 @@ ; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 ; GCN-NEXT: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -129,7 +129,7 @@ ; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 ; GCN-NEXT: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -143,7 +143,7 @@ ; GCN: v_add_i32 ; GCN-NEXT: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -156,14 +156,14 @@ ; GCN: buffer_load_dword ; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; GCN: s_endpgm -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8) @@ -171,7 +171,7 @@ ret void } -define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1) @@ -186,7 +186,7 @@ ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = lshr i32 %shl, 31 @@ -201,7 +201,7 @@ ; GCN-NOT: shr ; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 ; GCN: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -214,7 +214,7 @@ ; 
GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31) @@ -226,7 +226,7 @@ ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31) @@ -239,7 +239,7 @@ ; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) @@ -252,7 +252,7 @@ ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -264,7 +264,7 @@ ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -276,7 +276,7 @@ ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -288,7 +288,7 @@ ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -299,7 +299,7 @@ ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) @@ -310,7 +310,7 @@ ; GCN-NOT: lshr ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void 
@bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) @@ -323,7 +323,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -335,7 +335,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -347,7 +347,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -359,7 +359,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -371,7 +371,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -383,7 +383,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -395,7 +395,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -407,7 +407,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -419,7 +419,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, 
i32 6, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -431,7 +431,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -443,7 +443,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -455,7 +455,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -467,7 +467,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -479,7 +479,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -491,7 +491,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -503,7 +503,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -515,7 +515,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -527,7 +527,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -539,7 +539,7 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void 
@bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -556,7 +556,7 @@ ; GCN-DAG: buffer_store_dword [[AND]] ; GCN-DAG: buffer_store_dword [[BFE]] ; GCN: s_endpgm -define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, +define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 @@ -570,7 +570,7 @@ ; GCN-LABEL: {{^}}lshr_and: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; GCN: buffer_store_dword -define void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { %b = lshr i32 %a, 6 %c = and i32 %b, 7 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -580,7 +580,7 @@ ; GCN-LABEL: {{^}}v_lshr_and: ; GCN: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 ; GCN: buffer_store_dword -define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = lshr i32 %a, %b %d = and i32 %c, 7 store i32 %d, i32 addrspace(1)* %out, align 8 @@ -590,7 +590,7 @@ ; GCN-LABEL: {{^}}and_lshr: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; GCN: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { %b = and i32 %a, 448 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -600,7 +600,7 @@ ; GCN-LABEL: {{^}}and_lshr2: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; GCN: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { %b = and i32 %a, 511 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -610,7 +610,7 @@ ; GCN-LABEL: {{^}}shl_lshr: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 ; GCN: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { %b = shl i32 %a, 9 %c = lshr i32 %b, 11 store i32 %c, i32 addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll @@ -4,7 +4,7 @@ ; GCN-DAG: ; wave barrier ; GCN-NOT: s_barrier -define void @test_wave_barrier() #0 { +define amdgpu_kernel void @test_wave_barrier() #0 { entry: call void @llvm.amdgcn.wave.barrier() #1 ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll @@ -34,7 +34,7 @@ ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %id, i32 addrspace(1)* %out ret void @@ -61,7 +61,7 @@ ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ALL: 
COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %id, i32 addrspace(1)* %out ret void @@ -96,7 +96,7 @@ ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %id, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -18,7 +18,7 @@ ; ALL-NOT: v0 ; ALL: {{buffer|flat}}_store_dword {{.*}}v0 -define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id, i32 addrspace(1)* %out ret void @@ -33,7 +33,7 @@ ; ALL-NOT: v1 ; ALL: {{buffer|flat}}_store_dword {{.*}}v1 -define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.y() store i32 %id, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ ; ALL-NOT: v2 ; ALL: {{buffer|flat}}_store_dword {{.*}}v2 -define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.z() store i32 %id, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.ceil.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -12,7 +12,7 @@ ; VI: v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @ceil_f16( +define amdgpu_kernel void @ceil_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @ceil_v2f16( +define amdgpu_kernel void @ceil_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.cos.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -13,7 +13,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @cos_f16( +define amdgpu_kernel void @cos_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -53,7 +53,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @cos_v2f16( +define amdgpu_kernel void @cos_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.cos.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.ll +++ test/CodeGen/AMDGPU/llvm.cos.ll @@ -11,7 +11,7 @@ ;SI: v_cos_f32 ;SI-NOT: v_cos_f32 -define void @test(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %x) 
#1 { %cos = call float @llvm.cos.f32(float %x) store float %cos, float addrspace(1)* %out ret void @@ -29,7 +29,7 @@ ;SI: v_cos_f32 ;SI-NOT: v_cos_f32 -define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { +define amdgpu_kernel void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) store <4 x float> %cos, <4 x float> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.dbg.value.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -9,7 +9,7 @@ ; CHECK: buffer_store_dword ; CHECK: s_endpgm -define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { +define amdgpu_kernel void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { entry: tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14 store i32 123, i32 addrspace(1)* %globalptr_arg, align 4 Index: test/CodeGen/AMDGPU/llvm.exp2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -12,7 +12,7 @@ ; VI: v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @exp2_f16( +define amdgpu_kernel void @exp2_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @exp2_v2f16( +define amdgpu_kernel void @exp2_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.exp2.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.exp2.ll +++ test/CodeGen/AMDGPU/llvm.exp2.ll @@ -11,7 +11,7 @@ ;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} ;SI: v_exp_f32 -define void @test(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.exp2.f32(float %in) store float %0, float addrspace(1)* %out @@ -34,7 +34,7 @@ ;SI: v_exp_f32 ;SI: v_exp_f32 -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -68,7 +68,7 @@ ;SI: v_exp_f32 ;SI: v_exp_f32 ;SI: v_exp_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.floor.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -12,7 +12,7 @@ ; VI: v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @floor_f16( +define amdgpu_kernel void @floor_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @floor_v2f16( +define amdgpu_kernel 
void @floor_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -16,7 +16,7 @@ ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16( +define amdgpu_kernel void @fma_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -42,7 +42,7 @@ ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_a( +define amdgpu_kernel void @fma_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -65,7 +65,7 @@ ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_b( +define amdgpu_kernel void @fma_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -88,7 +88,7 @@ ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_c( +define amdgpu_kernel void @fma_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -130,7 +130,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16( +define amdgpu_kernel void @fma_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -165,7 +165,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_a( +define amdgpu_kernel void @fma_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b, <2 x half> addrspace(1)* %c) { @@ -203,7 +203,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_b( +define amdgpu_kernel void @fma_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %c) { @@ -241,7 +241,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_c( +define amdgpu_kernel void @fma_v2f16_imm_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -24,7 +24,7 @@ ; VI-DENORM: buffer_store_short [[RESULT]] ; GCN: s_endpgm -define void @fmuladd_f16( +define amdgpu_kernel void @fmuladd_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -54,7 +54,7 @@ ; VI-DENORM: buffer_store_short [[RESULT]] ; GCN: s_endpgm -define void @fmuladd_f16_imm_a( +define amdgpu_kernel void @fmuladd_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -83,7 +83,7 @@ ; GCN: s_endpgm -define void @fmuladd_f16_imm_b( +define amdgpu_kernel void @fmuladd_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -135,7 +135,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] ; 
GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmuladd_v2f16( +define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, Index: test/CodeGen/AMDGPU/llvm.log2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -12,7 +12,7 @@ ; VI: v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @log2_f16( +define amdgpu_kernel void @log2_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @log2_v2f16( +define amdgpu_kernel void @log2_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.log2.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log2.ll +++ test/CodeGen/AMDGPU/llvm.log2.ll @@ -11,7 +11,7 @@ ;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ;SI: v_log_f32 -define void @test(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.log2.f32(float %in) store float %0, float addrspace(1)* %out @@ -34,7 +34,7 @@ ;SI: v_log_f32 ;SI: v_log_f32 -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -68,7 +68,7 @@ ;SI: v_log_f32 ;SI: v_log_f32 ;SI: v_log_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -14,7 +14,7 @@ ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16( +define amdgpu_kernel void @maxnum_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -34,7 +34,7 @@ ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16_imm_a( +define amdgpu_kernel void @maxnum_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -52,7 +52,7 @@ ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16_imm_b( +define amdgpu_kernel void @maxnum_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -87,7 +87,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16( +define amdgpu_kernel void @maxnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -115,7 +115,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16_imm_a( +define amdgpu_kernel void 
@maxnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -141,7 +141,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16_imm_b( +define amdgpu_kernel void @maxnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.memcpy.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.memcpy.ll +++ test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -80,7 +80,7 @@ ; SI-DAG: ds_write_b8 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind @@ -125,7 +125,7 @@ ; SI-DAG: ds_write_b16 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind @@ -144,7 +144,7 @@ ; SI: ds_write2_b32 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind @@ -161,7 +161,7 @@ ; SI: ds_write2_b64 ; SI-DAG: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind @@ -238,7 +238,7 @@ ; SI-DAG: buffer_store_byte ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind @@ -281,7 +281,7 @@ ; SI-DAG: buffer_store_short ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* 
noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind @@ -294,7 +294,7 @@ ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind @@ -307,7 +307,7 @@ ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind @@ -320,7 +320,7 @@ ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind @@ -340,7 +340,7 @@ ; SI-DAG: s_load_dwordx2 ; SI-DAG: buffer_store_dwordx4 ; SI-DAG: buffer_store_dwordx4 -define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind { +define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind { %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)* call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 4, i1 false) ret void @@ -365,7 +365,7 @@ ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI: buffer_store_byte -define void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind { +define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind { %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)* call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 1, i1 false) ret void Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -14,7 +14,7 @@ ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: 
s_endpgm -define void @minnum_f16( +define amdgpu_kernel void @minnum_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -34,7 +34,7 @@ ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16_imm_a( +define amdgpu_kernel void @minnum_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -52,7 +52,7 @@ ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16_imm_b( +define amdgpu_kernel void @minnum_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -86,7 +86,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16( +define amdgpu_kernel void @minnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -117,7 +117,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16_imm_a( +define amdgpu_kernel void @minnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -143,7 +143,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16_imm_b( +define amdgpu_kernel void @minnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.r600.dot4.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.r600.dot4.ll +++ test/CodeGen/AMDGPU/llvm.r600.dot4.ll @@ -2,7 +2,7 @@ declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone -define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { +define amdgpu_kernel void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone Index: test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll +++ test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll @@ -2,7 +2,7 @@ ; EG-LABEL: {{^}}test_group_barrier: ; EG: GROUP_BARRIER -define void @test_group_barrier(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_group_barrier(i32 addrspace(1)* %out) #0 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp Index: test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -14,7 +14,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword 
[[VVAL]] -define void @local_size_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -44,7 +44,7 @@ ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -59,7 +59,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -78,7 +78,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xz(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -98,7 +98,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_yz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) { entry: %y = call i32 @llvm.r600.read.local.size.y() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -121,7 +121,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xyz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -138,7 +138,7 @@ ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.x() #0 %shl = shl i32 %size, 16 @@ -153,7 +153,7 @@ ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.y() #0 %shl = shl i32 %size, 16 @@ -168,7 +168,7 @@ ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.z() #0 %shl = shl i32 %size, 16 Index: test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll +++ test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll @@ -4,7 +4,7 @@ ; EG-LABEL: {{^}}rsq_clamped_f32: ; EG: RECIPSQRT_CLAMPED -define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { 
+define amdgpu_kernel void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { %rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src) store float %rsq_clamped, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll +++ test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll @@ -4,7 +4,7 @@ ; EG-LABEL: {{^}}recipsqrt.ieee_f32: ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; TODO: Really these should be constant folded ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0 ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0 ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/llvm.r600.tex.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.r600.tex.ll +++ test/CodeGen/AMDGPU/llvm.r600.tex.ll @@ -17,7 +17,7 @@ ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { bb: %addr = load <4 x float>, <4 x float> addrspace(1)* %in %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> Index: test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -12,7 +12,7 @@ ; VI: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rint_f16( +define amdgpu_kernel void @rint_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @rint_v2f16( +define amdgpu_kernel void @rint_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.rint.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -11,7 +11,7 @@ ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 ; SI: s_endpgm -define void @rint_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @rint_f64(double addrspace(1)* 
%out, double %in) { entry: %0 = call double @llvm.rint.f64(double %in) store double %0, double addrspace(1)* %out @@ -21,7 +21,7 @@ ; FUNC-LABEL: {{^}}rint_v2f64: ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 -define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { entry: %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) store <2 x double> %0, <2 x double> addrspace(1)* %out @@ -33,7 +33,7 @@ ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 -define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { entry: %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) store <4 x double> %0, <4 x double> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.rint.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.ll +++ test/CodeGen/AMDGPU/llvm.rint.ll @@ -6,7 +6,7 @@ ; R600: RNDNE ; SI: v_rndne_f32_e32 -define void @rint_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @rint_f32(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.rint.f32(float %in) #0 store float %0, float addrspace(1)* %out @@ -19,7 +19,7 @@ ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 -define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -36,7 +36,7 @@ ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 -define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 store <4 x float> %0, <4 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.round.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.round.f64.ll +++ test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -2,7 +2,7 @@ ; FUNC-LABEL: {{^}}round_f64: ; SI: s_endpgm -define void @round_f64(double addrspace(1)* %out, double %x) #0 { +define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 { %result = call double @llvm.round.f64(double %x) #1 store double %result, double addrspace(1)* %out ret void @@ -26,7 +26,7 @@ ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid @@ -38,7 +38,7 @@ ; FUNC-LABEL: {{^}}round_v2f64: ; SI: s_endpgm -define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { +define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 store <2 x double> %result, <2 x double> addrspace(1)* %out ret void @@ -46,7 +46,7 @@ ; FUNC-LABEL: {{^}}round_v4f64: ; SI: s_endpgm -define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { +define amdgpu_kernel void 
@round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -54,7 +54,7 @@ ; FUNC-LABEL: {{^}}round_v8f64: ; SI: s_endpgm -define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { +define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, <8 x double> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.round.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.round.ll +++ test/CodeGen/AMDGPU/llvm.round.ll @@ -20,7 +20,7 @@ ; R600-DAG: SETGE ; R600-DAG: CNDE ; R600-DAG: ADD -define void @round_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @round_f32(float addrspace(1)* %out, float %x) #0 { %result = call float @llvm.round.f32(float %x) #1 store float %result, float addrspace(1)* %out ret void @@ -34,7 +34,7 @@ ; FUNC-LABEL: {{^}}round_v2f32: ; SI: s_endpgm ; R600: CF_END -define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { +define amdgpu_kernel void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -43,7 +43,7 @@ ; FUNC-LABEL: {{^}}round_v4f32: ; SI: s_endpgm ; R600: CF_END -define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { +define amdgpu_kernel void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 store <4 x float> %result, <4 x float> addrspace(1)* %out ret void @@ -52,7 +52,7 @@ ; FUNC-LABEL: {{^}}round_v8f32: ; SI: s_endpgm ; R600: CF_END -define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { +define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 store <8 x float> %result, <8 x float> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.sin.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -13,7 +13,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sin_f16( +define amdgpu_kernel void @sin_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -52,7 +52,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @sin_v2f16( +define amdgpu_kernel void @sin_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.sin.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.ll +++ test/CodeGen/AMDGPU/llvm.sin.ll @@ -12,7 +12,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @sin_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @sin_f32(float addrspace(1)* %out, float %x) #1 { %sin = call float @llvm.sin.f32(float %x) store float %sin, float addrspace(1)* %out ret void @@ -24,7 +24,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @safe_sin_3x_f32(float 
addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 3.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -38,7 +38,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 3.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -51,7 +51,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -65,7 +65,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -78,7 +78,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -91,7 +91,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -110,7 +110,7 @@ ; SI: v_sin_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { +define amdgpu_kernel void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) store <4 x float> %sin, <4 x float> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/llvm.sqrt.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -12,7 +12,7 @@ ; VI: v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sqrt_f16( +define amdgpu_kernel void @sqrt_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @sqrt_v2f16( +define amdgpu_kernel void @sqrt_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/llvm.trunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -12,7 +12,7 @@ ; VI: v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @trunc_f16( +define amdgpu_kernel void @trunc_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: 
buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @trunc_v2f16( +define amdgpu_kernel void @trunc_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/load-constant-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-f64.ll +++ test/CodeGen/AMDGPU/load-constant-f64.ll @@ -6,7 +6,7 @@ ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_store_dwordx2 -define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { %ld = load double, double addrspace(2)* %in store double %ld, double addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/load-constant-i1.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i1.ll +++ test/CodeGen/AMDGPU/load-constant-i1.ll @@ -9,56 +9,56 @@ ; EG: VTX_READ_8 ; EG: AND_INT -define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %load = load i1, i1 addrspace(2)* %in store i1 %load, i1 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v2i1: -define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in store <2 x i1> %load, <2 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v3i1: -define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in store <3 x i1> %load, <3 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v4i1: -define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in store <4 x i1> %load, <4 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v8i1: -define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in store <8 x i1> %load, <8 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v16i1: -define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in store <16 x i1> %load, <16 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v32i1: -define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in store <32 x 
i1> %load, <32 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v64i1: -define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in store <64 x i1> %load, <64 x i1> addrspace(1)* %out ret void @@ -67,7 +67,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32: ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword -define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -81,7 +81,7 @@ ; EG: VTX_READ_8 ; EG: BFE_INT -define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -89,7 +89,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32: -define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -97,7 +97,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32: -define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -105,7 +105,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32: -define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -113,7 +113,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32: -define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -121,7 +121,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32: -define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> 
addrspace(1)* %out @@ -129,7 +129,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32: -define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32: -define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -145,7 +145,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32: -define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -153,7 +153,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32: -define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -161,7 +161,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32: -define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -169,7 +169,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32: -define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -177,7 +177,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32: -define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -185,7 +185,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32: -define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void 
@constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -193,7 +193,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32: -define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -201,7 +201,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32: -define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -209,7 +209,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32: -define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -221,7 +221,7 @@ ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]] ; GCN: buffer_store_dwordx2 -define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -233,7 +233,7 @@ ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: buffer_store_dwordx2 -define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -241,7 +241,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64: -define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64: -define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x 
i64> addrspace(1)* %out @@ -257,7 +257,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64: -define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -265,7 +265,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64: -define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -273,7 +273,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64: -define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -281,7 +281,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64: -define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -289,7 +289,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64: -define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -297,7 +297,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64: -define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -305,7 +305,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64: -define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -313,7 +313,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64: -define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x 
i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -321,7 +321,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64: -define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -329,7 +329,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64: -define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -337,7 +337,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64: -define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -345,7 +345,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64: -define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -353,7 +353,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64: -define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -361,7 +361,7 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64: -define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-constant-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i16.ll +++ test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,7 +8,7 @@ ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_i16(i16 
addrspace(1)* %out, i16 addrspace(2)* %in) { entry: %ld = load i16, i16 addrspace(2)* %in store i16 %ld, i16 addrspace(1)* %out @@ -19,7 +19,7 @@ ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in store <2 x i16> %ld, <2 x i16> addrspace(1)* %out @@ -31,7 +31,7 @@ ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 -define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in store <3 x i16> %ld, <3 x i16> addrspace(1)* %out @@ -42,7 +42,7 @@ ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in store <4 x i16> %ld, <4 x i16> addrspace(1)* %out @@ -53,7 +53,7 @@ ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in store <8 x i16> %ld, <8 x i16> addrspace(1)* %out @@ -65,7 +65,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in store <16 x i16> %ld, <16 x i16> addrspace(1)* %out @@ -80,7 +80,7 @@ ; GCN-HSA: flat_store_dword ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -97,7 +97,7 @@ ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -109,7 +109,7 @@ ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x 
i32> %ext, <1 x i32> addrspace(1)* %out @@ -123,7 +123,7 @@ ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -140,7 +140,7 @@ ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EG: 16 ; EG: 16 -define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -160,7 +160,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -183,7 +183,7 @@ ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -204,7 +204,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -229,7 +229,7 @@ ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -254,7 +254,7 @@ ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -288,7 +288,7 @@ ; EG-DAG: 65535 ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define 
amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -322,7 +322,7 @@ ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -337,7 +337,7 @@ ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1 -define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -352,7 +352,7 @@ ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1 -define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -369,7 +369,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -385,7 +385,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -404,7 +404,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1 -define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void 
@constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -421,7 +421,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1 -define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -438,7 +438,7 @@ ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -464,7 +464,7 @@ ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -475,7 +475,7 @@ ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -488,7 +488,7 @@ ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -498,7 +498,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -508,7 +508,7 @@ ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, 
<2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -518,7 +518,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -528,7 +528,7 @@ ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -538,7 +538,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -548,7 +548,7 @@ ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -559,7 +559,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -570,7 +570,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -583,7 +583,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void 
@constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -596,7 +596,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -606,7 +606,7 @@ ; These trigger undefined register machine verifier errors ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64: -; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -614,7 +614,7 @@ ; } ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64: -; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-constant-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i32.ll +++ test/CodeGen/AMDGPU/load-constant-i32.ll @@ -7,7 +7,7 @@ ; GCN: s_load_dword s{{[0-9]+}} ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { entry: %ld = load i32, i32 addrspace(2)* %in store i32 %ld, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 -define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -29,7 +29,7 @@ ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -40,7 +40,7 @@ ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i32(<4 x i32> 
addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -52,7 +52,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out @@ -66,7 +66,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -81,7 +81,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG: CF_END ; EG: VTX_READ_32 -define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -98,7 +98,7 @@ ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. ; EG: 31 -define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -108,7 +108,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64: ; GCN: s_load_dword ; GCN: store_dwordx2 -define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -119,7 +119,7 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31 ; GCN: store_dwordx2 -define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -129,7 +129,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64: ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: store_dwordx4 -define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -143,7 +143,7 @@ ; GCN-DAG: s_ashr_i32 ; GCN: store_dwordx4 -define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> 
addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -155,7 +155,7 @@ ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -172,7 +172,7 @@ ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -191,7 +191,7 @@ ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-SA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -219,7 +219,7 @@ ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -240,7 +240,7 @@ ; GCN: store_dwordx4 ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -267,7 +267,7 @@ ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -319,7 +319,7 @@ ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -370,7 +370,7 @@ ; GCN-HSA-DAG: 
flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-constant-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i64.ll +++ test/CodeGen/AMDGPU/load-constant-i64.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}constant_load_i64: ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; EG: VTX_READ_64 -define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 { %ld = load i64, i64 addrspace(2)* %in store i64 %ld, i64 addrspace(1)* %out ret void @@ -17,7 +17,7 @@ ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in store <2 x i64> %ld, <2 x i64> addrspace(1)* %out @@ -29,7 +29,7 @@ ; EG-DAG: VTX_READ_128 ; EG-DAG: VTX_READ_128 -define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in store <3 x i64> %ld, <3 x i64> addrspace(1)* %out @@ -41,7 +41,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in store <4 x i64> %ld, <4 x i64> addrspace(1)* %out @@ -55,7 +55,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in store <8 x i64> %ld, <8 x i64> addrspace(1)* %out @@ -74,7 +74,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in store <16 x i64> %ld, <16 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-constant-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i8.ll +++ test/CodeGen/AMDGPU/load-constant-i8.ll @@ -10,7 +10,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; TODO: NOT AND -define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { entry: %ld = load i8, i8 addrspace(2)* %in store i8 %ld, i8 
addrspace(1)* %out @@ -22,7 +22,7 @@ ; GCN-HSA: flat_load_ushort v ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in store <2 x i8> %ld, <2 x i8> addrspace(1)* %out @@ -33,7 +33,7 @@ ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out @@ -44,7 +44,7 @@ ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in store <4 x i8> %ld, <4 x i8> addrspace(1)* %out @@ -55,7 +55,7 @@ ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in store <8 x i8> %ld, <8 x i8> addrspace(1)* %out @@ -66,7 +66,7 @@ ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in store <16 x i8> %ld, <16 x i8> addrspace(1)* %out @@ -78,7 +78,7 @@ ; GCN-HSA: flat_load_ubyte ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -92,7 +92,7 @@ ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %ld = load i8, i8 addrspace(2)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(1)* %out @@ -102,7 +102,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -114,7 +114,7 @@ ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void 
@constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ ; TODO: This should use DST, but for some there are redundant MOVs ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG: 8 -define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -150,7 +150,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -170,7 +170,7 @@ ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -193,7 +193,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -214,7 +214,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -236,7 +236,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -264,7 +264,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -294,7 +294,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void 
@constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -335,7 +335,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -378,7 +378,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -450,7 +450,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -526,7 +526,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -539,7 +539,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(2)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -552,7 +552,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(2)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -570,7 +570,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void 
@constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -589,7 +589,7 @@ ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -600,7 +600,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -613,7 +613,7 @@ ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -623,7 +623,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -633,7 +633,7 @@ ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -643,7 +643,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -653,7 +653,7 @@ ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -663,7 +663,7 @@ ; 
FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -673,7 +673,7 @@ ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -683,7 +683,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -693,7 +693,7 @@ ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -704,7 +704,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -715,7 +715,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -723,7 +723,7 @@ } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64: -; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out 
@@ -731,7 +731,7 @@ ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64: -; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -744,7 +744,7 @@ ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]], ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] -define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -759,7 +759,7 @@ ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -767,7 +767,7 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16: -define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -778,7 +778,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -788,7 +788,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -800,7 +800,7 @@ ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -810,7 +810,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i8_to_v4i16(<4 x i16> 
addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -824,7 +824,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -834,7 +834,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -853,7 +853,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -863,7 +863,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -889,7 +889,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -900,7 +900,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 
x i16> %ext, <32 x i16> addrspace(1)* %out @@ -943,7 +943,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -951,7 +951,7 @@ } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16: -; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out @@ -959,7 +959,7 @@ ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16: -; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-global-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-f32.ll +++ test/CodeGen/AMDGPU/load-global-f32.ll @@ -10,7 +10,7 @@ ; GCN-HSA: flat_load_dword ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %tmp0 = load float, float addrspace(1)* %in store float %tmp0, float addrspace(1)* %out @@ -22,7 +22,7 @@ ; GCN-HSA: flat_load_dwordx2 ; R600: VTX_READ_64 -define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in store <2 x float> %tmp0, <2 x float> addrspace(1)* %out @@ -34,7 +34,7 @@ ; GCN-HSA: flat_load_dwordx4 ; R600: VTX_READ_128 -define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in store <3 x float> %tmp0, <3 x float> addrspace(1)* %out @@ -46,7 +46,7 @@ ; GCN-HSA: flat_load_dwordx4 ; R600: VTX_READ_128 -define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in store <4 x float> %tmp0, <4 x float> addrspace(1)* %out @@ -61,7 +61,7 @@ ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { +define 
amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in store <8 x float> %tmp0, <8 x float> addrspace(1)* %out @@ -83,7 +83,7 @@ ; R600: VTX_READ_128 ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in store <16 x float> %tmp0, <16 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-global-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-f64.ll +++ test/CodeGen/AMDGPU/load-global-f64.ll @@ -8,7 +8,7 @@ ; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] -define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %ld = load double, double addrspace(1)* %in store double %ld, double addrspace(1)* %out ret void @@ -17,7 +17,7 @@ ; FUNC-LABEL: {{^}}global_load_v2f64: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { entry: %ld = load <2 x double>, <2 x double> addrspace(1)* %in store <2 x double> %ld, <2 x double> addrspace(1)* %out @@ -29,7 +29,7 @@ ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(1)* %in store <3 x double> %ld, <3 x double> addrspace(1)* %out @@ -42,7 +42,7 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { entry: %ld = load <4 x double>, <4 x double> addrspace(1)* %in store <4 x double> %ld, <4 x double> addrspace(1)* %out @@ -59,7 +59,7 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { entry: %ld = load <8 x double>, <8 x double> addrspace(1)* %in store <8 x double> %ld, <8 x double> addrspace(1)* %out @@ -84,7 +84,7 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { entry: %ld = load <16 x double>, <16 x double> addrspace(1)* %in store <16 x double> %ld, <16 x double> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-global-i1.ll 
=================================================================== --- test/CodeGen/AMDGPU/load-global-i1.ll +++ test/CodeGen/AMDGPU/load-global-i1.ll @@ -9,56 +9,56 @@ ; EG: VTX_READ_8 ; EG: AND_INT -define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %load = load i1, i1 addrspace(1)* %in store i1 %load, i1 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v2i1: -define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in store <2 x i1> %load, <2 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v3i1: -define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in store <3 x i1> %load, <3 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v4i1: -define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in store <4 x i1> %load, <4 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v8i1: -define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in store <8 x i1> %load, <8 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v16i1: -define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in store <16 x i1> %load, <16 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v32i1: -define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in store <32 x i1> %load, <32 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v64i1: -define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in store <64 x i1> %load, <64 x i1> addrspace(1)* %out ret void @@ -67,7 +67,7 @@ ; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword -define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -81,7 +81,7 @@ ; EG: VTX_READ_8 ; EG: BFE_INT -define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, 
i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -89,7 +89,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32: -define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -97,7 +97,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32: -define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -105,7 +105,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32: -define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -113,7 +113,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32: -define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -121,7 +121,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32: -define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32: -define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32: -define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -145,7 +145,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32: -define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* 
%in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -153,7 +153,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32: -define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -161,7 +161,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32: -define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -169,7 +169,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32: -define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -177,7 +177,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32: -define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -185,7 +185,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32: -define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -193,7 +193,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32: -define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -201,7 +201,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32: -define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -209,7 +209,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32: -define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> 
addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -221,7 +221,7 @@ ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}} ; GCN: buffer_store_dwordx2 -define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -233,7 +233,7 @@ ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: buffer_store_dwordx2 -define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -241,7 +241,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64: -define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64: -define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -257,7 +257,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64: -define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -265,7 +265,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64: -define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -273,7 +273,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64: -define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -281,7 +281,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64: -define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> 
addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -289,7 +289,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64: -define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -297,7 +297,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64: -define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -305,7 +305,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64: -define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -313,7 +313,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64: -define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -321,7 +321,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64: -define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -329,7 +329,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64: -define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -337,7 +337,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64: -define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -345,7 +345,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64: -define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(<32 x 
i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -353,7 +353,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64: -define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -361,7 +361,7 @@ } ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64: -define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i16.ll +++ test/CodeGen/AMDGPU/load-global-i16.ll @@ -11,7 +11,7 @@ ; GCN-HSA: flat_load_ushort ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { entry: %ld = load i16, i16 addrspace(1)* %in store i16 %ld, i16 addrspace(1)* %out @@ -23,7 +23,7 @@ ; GCN-HSA: flat_load_dword v ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in store <2 x i16> %ld, <2 x i16> addrspace(1)* %out @@ -36,7 +36,7 @@ ; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 -define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in store <3 x i16> %ld, <3 x i16> addrspace(1)* %out @@ -48,7 +48,7 @@ ; GCN-HSA: flat_load_dwordx2 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in store <4 x i16> %ld, <4 x i16> addrspace(1)* %out @@ -60,7 +60,7 @@ ; GCN-HSA: flat_load_dwordx4 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in store <8 x i16> %ld, <8 x i16> addrspace(1)* %out @@ -76,7 +76,7 @@ ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_load_v16i16(<16 x i16> 
addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in store <16 x i16> %ld, <16 x i16> addrspace(1)* %out @@ -91,7 +91,7 @@ ; GCN-HSA: flat_store_dword ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -108,7 +108,7 @@ ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EGCM: 16 -define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -120,7 +120,7 @@ ; GCN-HSA: flat_load_ushort ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -134,7 +134,7 @@ ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EGCM: 16 -define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -148,7 +148,7 @@ ; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EGCM: 16 -define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -168,7 +168,7 @@ ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -190,7 +190,7 @@ ; EGCM: 16 ; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal ; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal -define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* 
%in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -214,7 +214,7 @@ ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -237,7 +237,7 @@ ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal ; EGCM-DAG: 16 -define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -262,7 +262,7 @@ ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -296,7 +296,7 @@ ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -330,7 +330,7 @@ ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -346,7 +346,7 @@ ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -357,7 +357,7 @@ ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -379,7 +379,7 @@ ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, 
{{T[0-9]+.[XYZW]}}, 16, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -401,7 +401,7 @@ ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -435,7 +435,7 @@ ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 -define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -452,7 +452,7 @@ ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 -define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -469,7 +469,7 @@ ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM: MOV {{.*}}, 0.0 -define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -495,7 +495,7 @@ ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EGCM: 31 -define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -506,7 +506,7 @@ ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM: MOV {{.*}}, 0.0 -define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> 
addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -519,7 +519,7 @@ ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EGCM: 31 -define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -527,7 +527,7 @@ } ; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64: -define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -537,7 +537,7 @@ ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64: ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -547,7 +547,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64: ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -557,7 +557,7 @@ ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64: ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -567,7 +567,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64: ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -577,7 +577,7 @@ ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64: ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = 
load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -588,7 +588,7 @@ ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -599,7 +599,7 @@ ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -612,7 +612,7 @@ ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -625,7 +625,7 @@ ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -633,7 +633,7 @@ } ; ; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64: -; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -641,7 +641,7 @@ ; } ; ; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64: -; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i32.ll +++ 
test/CodeGen/AMDGPU/load-global-i32.ll @@ -9,7 +9,7 @@ ; GCN-HSA: flat_load_dword ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %ld = load i32, i32 addrspace(1)* %in store i32 %ld, i32 addrspace(1)* %out @@ -21,7 +21,7 @@ ; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_64 -define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -33,7 +33,7 @@ ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -45,7 +45,7 @@ ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -60,7 +60,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out @@ -82,7 +82,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -98,7 +98,7 @@ ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -117,7 +117,7 @@ ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -130,7 +130,7 @@ ; GCN-HSA: flat_load_dword ; GCN-HSA: flat_store_dwordx2 -define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -143,7 +143,7 @@ ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -156,7 +156,7 @@ ; GCN-HSA: flat_load_dwordx2 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -172,7 +172,7 @@ ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -187,7 +187,7 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -208,7 +208,7 @@ ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -231,7 +231,7 @@ ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-SA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) 
#0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -263,7 +263,7 @@ ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -309,7 +309,7 @@ ; GCN-DAG: v_ashrrev_i32 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -344,7 +344,7 @@ ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -444,7 +444,7 @@ ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -511,7 +511,7 @@ ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-global-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i64.ll +++ test/CodeGen/AMDGPU/load-global-i64.ll @@ -13,7 +13,7 @@ ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] ; EG: VTX_READ_64 -define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %ld = load i64, i64 addrspace(1)* %in store i64 %ld, i64 addrspace(1)* %out ret void @@ -24,7 +24,7 @@ ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in store <2 x i64> %ld, <2 x i64> addrspace(1)* %out @@ 
-40,7 +40,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in store <3 x i64> %ld, <3 x i64> addrspace(1)* %out @@ -56,7 +56,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in store <4 x i64> %ld, <4 x i64> addrspace(1)* %out @@ -78,7 +78,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in store <8 x i64> %ld, <8 x i64> addrspace(1)* %out @@ -112,7 +112,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in store <16 x i64> %ld, <16 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-global-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i8.ll +++ test/CodeGen/AMDGPU/load-global-i8.ll @@ -11,7 +11,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; TODO: NOT AND -define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { entry: %ld = load i8, i8 addrspace(1)* %in store i8 %ld, i8 addrspace(1)* %out @@ -23,7 +23,7 @@ ; GCN-HSA: flat_load_ushort v ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in store <2 x i8> %ld, <2 x i8> addrspace(1)* %out @@ -35,7 +35,7 @@ ; GCN-HSA: flat_load_dword v ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out @@ -47,7 +47,7 @@ ; GCN-HSA: flat_load_dword v ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in store <4 x i8> %ld, <4 x i8> addrspace(1)* %out @@ -59,7 +59,7 @@ ; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> 
addrspace(1)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in store <8 x i8> %ld, <8 x i8> addrspace(1)* %out @@ -72,7 +72,7 @@ ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in store <16 x i8> %ld, <16 x i8> addrspace(1)* %out @@ -84,7 +84,7 @@ ; GCN-HSA: flat_load_ubyte ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -98,7 +98,7 @@ ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %ld = load i8, i8 addrspace(1)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(1)* %out @@ -108,7 +108,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -120,7 +120,7 @@ ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -135,7 +135,7 @@ ; TODO: These should use DST, but for some there are redundant MOVs ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal ; EG-DAG: 8 -define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -152,7 +152,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -174,7 +174,7 @@ ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG-DAG: 8 ; EG-DAG: 8 -define void 
@global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -207,7 +207,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -227,7 +227,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -248,7 +248,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -273,7 +273,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -300,7 +300,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -341,7 +341,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -384,7 +384,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -456,7 +456,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { 
%load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -532,7 +532,7 @@ ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -545,7 +545,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(1)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -558,7 +558,7 @@ ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(1)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -576,7 +576,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -595,7 +595,7 @@ ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -606,7 +606,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -619,7 +619,7 @@ ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? 
; EG: 31 -define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -629,7 +629,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -639,7 +639,7 @@ ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -649,7 +649,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -659,7 +659,7 @@ ; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -669,7 +669,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -679,7 +679,7 @@ ; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -689,7 +689,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> 
addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -699,7 +699,7 @@ ; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -710,7 +710,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -721,7 +721,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -729,7 +729,7 @@ } ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64: -; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -737,7 +737,7 @@ ; } ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64: -; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -752,7 +752,7 @@ ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -768,7 +768,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 
addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -778,7 +778,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -789,7 +789,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -799,7 +799,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -811,7 +811,7 @@ ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -821,7 +821,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -835,7 +835,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -845,7 +845,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -863,7 +863,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -873,7 +873,7 @@ ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -899,7 +899,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -910,7 +910,7 @@ ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -953,7 +953,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -961,7 +961,7 @@ } ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16: -; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out @@ -969,7 
+969,7 @@ ; } ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16: -; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/load-local-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f32.ll +++ test/CodeGen/AMDGPU/load-local-f32.ll @@ -7,7 +7,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 { entry: %tmp0 = load float, float addrspace(3)* %in store float %tmp0, float addrspace(1)* %out @@ -20,7 +20,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in store <2 x float> %tmp0, <2 x float> addrspace(1)* %out @@ -38,7 +38,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in store <3 x float> %tmp0, <3 x float> addrspace(3)* %out @@ -52,7 +52,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in store <4 x float> %tmp0, <4 x float> addrspace(3)* %out @@ -71,7 +71,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in store <8 x float> %tmp0, <8 x float> addrspace(3)* %out @@ -100,7 +100,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in store <16 x float> %tmp0, <16 x float> addrspace(3)* %out Index: test/CodeGen/AMDGPU/load-local-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f64.ll +++ test/CodeGen/AMDGPU/load-local-f64.ll @@ -9,7 +9,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 { %ld = load double, double addrspace(3)* %in store double %ld, double addrspace(3)* %out ret void @@ 
-22,7 +22,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 { entry: %ld = load <2 x double>, <2 x double> addrspace(3)* %in store <2 x double> %ld, <2 x double> addrspace(3)* %out @@ -39,7 +39,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(3)* %in store <3 x double> %ld, <3 x double> addrspace(3)* %out @@ -59,7 +59,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 { entry: %ld = load <4 x double>, <4 x double> addrspace(3)* %in store <4 x double> %ld, <4 x double> addrspace(3)* %out @@ -88,7 +88,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 { entry: %ld = load <8 x double>, <8 x double> addrspace(3)* %in store <8 x double> %ld, <8 x double> addrspace(3)* %out @@ -144,7 +144,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 { entry: %ld = load <16 x double>, <16 x double> addrspace(3)* %in store <16 x double> %ld, <16 x double> addrspace(3)* %out Index: test/CodeGen/AMDGPU/load-local-i1.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i1.ll +++ test/CodeGen/AMDGPU/load-local-i1.ll @@ -10,56 +10,56 @@ ; EG: LDS_UBYTE_READ_RET ; EG: AND_INT ; EG: LDS_BYTE_WRITE -define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %load = load i1, i1 addrspace(3)* %in store i1 %load, i1 addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v2i1: -define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in store <2 x i1> %load, <2 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v3i1: -define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in store <3 x i1> %load, <3 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v4i1: -define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> 
addrspace(3)* %in store <4 x i1> %load, <4 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v8i1: -define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in store <8 x i1> %load, <8 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v16i1: -define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in store <16 x i1> %load, <16 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v32i1: -define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in store <32 x i1> %load, <32 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v64i1: -define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in store <64 x i1> %load, <64 x i1> addrspace(3)* %out ret void @@ -68,7 +68,7 @@ ; FUNC-LABEL: {{^}}local_zextload_i1_to_i32: ; GCN: ds_read_u8 ; GCN: ds_write_b32 -define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -82,7 +82,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -90,7 +90,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32: -define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -98,7 +98,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32: -define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -106,7 +106,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32: -define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -114,7 
+114,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32: -define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -122,7 +122,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32: -define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(3)* %out @@ -130,7 +130,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32: -define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(3)* %out @@ -138,7 +138,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32: -define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -146,7 +146,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32: -define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -154,7 +154,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32: -define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -162,7 +162,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32: -define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -170,7 +170,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32: -define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -178,7 +178,7 @@ } ; FUNC-LABEL: 
{{^}}local_sextload_v16i1_to_v16i32: -define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -186,7 +186,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32: -define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -194,7 +194,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32: -define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -202,7 +202,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32: -define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -210,7 +210,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32: -define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -221,7 +221,7 @@ ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN: ds_write_b64 -define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -233,7 +233,7 @@ ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: ds_write_b64 -define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -241,7 +241,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64: -define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -249,7 +249,7 @@ } ; 
FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64: -define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -257,7 +257,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64: -define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -265,7 +265,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64: -define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -273,7 +273,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64: -define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(3)* %out @@ -281,7 +281,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64: -define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(3)* %out @@ -289,7 +289,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64: -define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -297,7 +297,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64: -define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -305,7 +305,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64: -define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -313,7 +313,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64: 
-define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -321,7 +321,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64: -define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -329,7 +329,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64: -define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -337,7 +337,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64: -define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -345,7 +345,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64: -define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -353,7 +353,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64: -define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -361,7 +361,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64: -define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(3)* %out Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -10,7 +10,7 @@ ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) { +define 
amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) { entry: %ld = load i16, i16 addrspace(3)* %in store i16 %ld, i16 addrspace(3)* %out @@ -25,7 +25,7 @@ ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in store <2 x i16> %ld, <2 x i16> addrspace(3)* %out @@ -39,7 +39,7 @@ ; EG-DAG: LDS_USHORT_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in store <3 x i16> %ld, <3 x i16> addrspace(3)* %out @@ -51,7 +51,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in store <4 x i16> %ld, <4 x i16> addrspace(3)* %out @@ -65,7 +65,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in store <8 x i16> %ld, <8 x i16> addrspace(3)* %out @@ -86,7 +86,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in store <16 x i16> %ld, <16 x i16> addrspace(3)* %out @@ -102,7 +102,7 @@ ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -121,7 +121,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal ; EG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -136,7 +136,7 @@ ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = zext <1 x i16> %load to <1 x 
i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -153,7 +153,7 @@ ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal ; EG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -166,7 +166,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -181,7 +181,7 @@ ; EG: LDS_READ_RET ; EG: BFE_INT ; EG: BFE_INT -define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -194,7 +194,7 @@ ; GCN-DAG: ds_write_b64 ; EG: LDS_READ_RET -define void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -211,7 +211,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -226,7 +226,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -244,7 +244,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -258,7 +258,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x 
i32> %ext, <8 x i32> addrspace(3)* %out @@ -280,7 +280,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -304,7 +304,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -340,7 +340,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -369,7 +369,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -406,7 +406,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -471,7 +471,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -512,7 +512,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -531,7 +531,7 @@ ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG-DAG: LDS_WRITE ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -558,7 +558,7 @@ ; EG-DAG: LDS_WRITE ; EG-DAG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -573,7 +573,7 @@ ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG-DAG: LDS_WRITE ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -590,7 +590,7 @@ ; EG-DAG: LDS_WRITE ; EG-DAG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -600,7 +600,7 @@ ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64: ; EG: LDS_READ_RET -define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -612,7 +612,7 @@ ; EG: LDS_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: ASHR -define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -623,7 +623,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -638,7 +638,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -651,7 +651,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -672,7 +672,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -689,7 +689,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -722,7 +722,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -747,7 +747,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -804,7 +804,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -812,7 +812,7 @@ } ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64: -; define void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -820,7 +820,7 @@ ; } ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64: -; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out Index: test/CodeGen/AMDGPU/load-local-i32.ll 
=================================================================== --- test/CodeGen/AMDGPU/load-local-i32.ll +++ test/CodeGen/AMDGPU/load-local-i32.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { entry: %ld = load i32, i32 addrspace(3)* %in store i32 %ld, i32 addrspace(3)* %out @@ -18,7 +18,7 @@ ; FUNC-LABEL: {{^}}local_load_v2i32: ; GCN: ds_read_b64 -define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in store <2 x i32> %ld, <2 x i32> addrspace(3)* %out @@ -28,7 +28,7 @@ ; FUNC-LABEL: {{^}}local_load_v3i32: ; GCN-DAG: ds_read_b64 ; GCN-DAG: ds_read_b32 -define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in store <3 x i32> %ld, <3 x i32> addrspace(3)* %out @@ -38,7 +38,7 @@ ; FUNC-LABEL: {{^}}local_load_v4i32: ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in store <4 x i32> %ld, <4 x i32> addrspace(3)* %out @@ -48,7 +48,7 @@ ; FUNC-LABEL: {{^}}local_load_v8i32: ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in store <8 x i32> %ld, <8 x i32> addrspace(3)* %out @@ -64,7 +64,7 @@ ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 -define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in store <16 x i32> %ld, <16 x i32> addrspace(3)* %out @@ -72,7 +72,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_i32_to_i64: -define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(3)* %out @@ -80,7 +80,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_i32_to_i64: -define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = sext i32 
%ld to i64 store i64 %ext, i64 addrspace(3)* %out @@ -88,7 +88,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64: -define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -96,7 +96,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64: -define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -104,7 +104,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64: -define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -112,7 +112,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64: -define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -120,7 +120,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64: -define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -128,7 +128,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64: -define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -136,7 +136,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64: -define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -144,7 +144,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64: -define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, 
<8 x i64> addrspace(3)* %out @@ -152,7 +152,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64: -define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -160,7 +160,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64 -define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -168,7 +168,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64: -define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -176,7 +176,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64: -define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out Index: test/CodeGen/AMDGPU/load-local-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i64.ll +++ test/CodeGen/AMDGPU/load-local-i64.ll @@ -9,7 +9,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { %ld = load i64, i64 addrspace(3)* %in store i64 %ld, i64 addrspace(3)* %out ret void @@ -22,7 +22,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in store <2 x i64> %ld, <2 x i64> addrspace(3)* %out @@ -39,7 +39,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in store <3 x i64> %ld, <3 x i64> addrspace(3)* %out @@ -59,7 +59,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in store <4 x i64> %ld, <4 x i64> addrspace(3)* %out @@ -88,7 +88,7 @@ ; EG: 
LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in store <8 x i64> %ld, <8 x i64> addrspace(3)* %out @@ -144,7 +144,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in store <16 x i64> %ld, <16 x i64> addrspace(3)* %out Index: test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i8.ll +++ test/CodeGen/AMDGPU/load-local-i8.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_u8 ; EG: LDS_UBYTE_READ_RET -define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { entry: %ld = load i8, i8 addrspace(3)* %in store i8 %ld, i8 addrspace(3)* %out @@ -22,7 +22,7 @@ ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET -define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in store <2 x i8> %ld, <2 x i8> addrspace(3)* %out @@ -33,7 +33,7 @@ ; GCN: ds_read_b32 ; EG: DS_READ_RET -define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in store <3 x i8> %ld, <3 x i8> addrspace(3)* %out @@ -44,7 +44,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in store <4 x i8> %ld, <4 x i8> addrspace(3)* %out @@ -56,7 +56,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in store <8 x i8> %ld, <8 x i8> addrspace(3)* %out @@ -71,7 +71,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in store <16 x i8> %ld, <16 x i8> addrspace(3)* %out @@ -84,7 +84,7 @@ ; GCN: ds_read_u8 ; EG: LDS_UBYTE_READ_RET -define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -98,7 +98,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_i8_to_i32(i32 addrspace(3)* 
%out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %ld = load i8, i8 addrspace(3)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(3)* %out @@ -108,7 +108,7 @@ ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32: ; EG: LDS_UBYTE_READ_RET -define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -119,7 +119,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -130,7 +130,7 @@ ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET -define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -156,7 +156,7 @@ ; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -172,7 +172,7 @@ ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, ; EG: LDS_READ_RET -define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -197,7 +197,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -214,7 +214,7 @@ ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -231,7 +231,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = 
load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -248,7 +248,7 @@ ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -267,7 +267,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -292,7 +292,7 @@ ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -321,7 +321,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -338,7 +338,7 @@ ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -355,7 +355,7 @@ ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -380,7 +380,7 @@ ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(3)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -405,7 +405,7 @@ ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +define 
amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(3)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -420,7 +420,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: MOV {{.*}}, literal ; EG: 0.0 -define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -437,7 +437,7 @@ ; EG: ASHR ; TODO: why not 7? ; EG: 31 -define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -450,7 +450,7 @@ ; EG: MOV {{.*}}, literal ; TODO: merge? ; EG: 0.0 -define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -463,7 +463,7 @@ ; EG: ASHR ; TODO: why not 7? ; EG: 31 -define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -473,7 +473,7 @@ ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64: ; EG: LDS_USHORT_READ_RET -define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -485,7 +485,7 @@ ; EG: LDS_USHORT_READ_RET ; EG: BFE_INT ; EG: BFE_INT -define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -495,7 +495,7 @@ ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64: ; EG: LDS_READ_RET -define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -505,7 +505,7 @@ ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64: ; EG: LDS_READ_RET -define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x 
i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -516,7 +516,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -536,7 +536,7 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -549,7 +549,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -562,7 +562,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -579,7 +579,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -596,7 +596,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -604,7 +604,7 @@ } ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64: -; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -612,7 +612,7 @@ ; } ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64: -; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* 
%out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -625,7 +625,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: LDS_SHORT_WRITE -define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(3)* %out @@ -639,7 +639,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT ; EG: LDS_SHORT_WRITE -define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(3)* %out @@ -650,7 +650,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: LDS_SHORT_WRITE -define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(3)* %out @@ -662,7 +662,7 @@ ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT ; EG: LDS_SHORT_WRITE -define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(3)* %out @@ -673,7 +673,7 @@ ; EG: LDS_USHORT_READ_RET ; EG: LDS_WRITE -define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(3)* %out @@ -686,7 +686,7 @@ ; EG: BFE_INT ; EG: BFE_INT ; EG: LDS_WRITE -define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(3)* %out @@ -698,7 +698,7 @@ ; EG: LDS_READ_RET ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(3)* %out @@ -715,7 +715,7 @@ ; EG-DAG: BFE_INT ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(3)* %out @@ -730,7 
+730,7 @@ ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(3)* %out @@ -754,7 +754,7 @@ ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(3)* %out @@ -775,7 +775,7 @@ ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(3)* %out @@ -813,7 +813,7 @@ ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(3)* %out @@ -846,7 +846,7 @@ ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(3)* %out @@ -908,7 +908,7 @@ ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(3)* %out @@ -916,7 +916,7 @@ } ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16: -; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out @@ -924,7 +924,7 @@ ; } ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16: -; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> 
%ext, <64 x i16> addrspace(3)* %out Index: test/CodeGen/AMDGPU/load-weird-sizes.ll =================================================================== --- test/CodeGen/AMDGPU/load-weird-sizes.ll +++ test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -8,7 +8,7 @@ ; SI: {{flat|buffer}}_load_ubyte ; SI: {{flat|buffer}}_load_ushort ; SI: {{flat|buffer}}_store_dword -define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { +define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { %1 = load i24, i24 addrspace(1)* %in %2 = zext i24 %1 to i32 store i32 %2, i32 addrspace(1)* %out @@ -21,7 +21,7 @@ ; CI-HSA: flat_load_dword [[VAL:v[0-9]+]] ; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]] -define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 { +define amdgpu_kernel void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 { %1 = load i25, i25 addrspace(1)* %in %2 = zext i25 %1 to i32 store i32 %2, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/local-64.ll =================================================================== --- test/CodeGen/AMDGPU/local-64.ll +++ test/CodeGen/AMDGPU/local-64.ll @@ -5,7 +5,7 @@ ; BOTH-LABEL: {{^}}local_i32_load ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 ; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 %val = load i32, i32 addrspace(3)* %gep, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -15,7 +15,7 @@ ; BOTH-LABEL: {{^}}local_i32_load_0_offset ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} ; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %val = load i32, i32 addrspace(3)* %in, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ ; BOTH-NOT: ADD ; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 ; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 %val = load i8, i8 addrspace(3)* %gep, align 4 store i8 %val, i8 addrspace(1)* %out, align 4 @@ -40,7 +40,7 @@ ; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] ; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] ; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 %val = load i8, i8 addrspace(3)* %gep, align 4 store i8 %val, i8 addrspace(1)* %out, align 4 @@ -51,7 +51,7 @@ ; BOTH-NOT: ADD ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 %val = load i64, i64 
addrspace(3)* %gep, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -61,7 +61,7 @@ ; BOTH-LABEL: {{^}}local_i64_load_0_offset ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %val = load i64, i64 addrspace(3)* %in, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -71,7 +71,7 @@ ; BOTH-NOT: ADD ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %gep = getelementptr double, double addrspace(3)* %in, i32 7 %val = load double, double addrspace(3)* %gep, align 8 store double %val, double addrspace(1)* %out, align 8 @@ -81,7 +81,7 @@ ; BOTH-LABEL: {{^}}local_f64_load_0_offset ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %val = load double, double addrspace(3)* %in, align 8 store double %val, double addrspace(1)* %out, align 8 ret void @@ -90,7 +90,7 @@ ; BOTH-LABEL: {{^}}local_i64_store: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_i64_store(i64 addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 store i64 5678, i64 addrspace(3)* %gep, align 8 ret void @@ -99,7 +99,7 @@ ; BOTH-LABEL: {{^}}local_i64_store_0_offset: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { store i64 1234, i64 addrspace(3)* %out, align 8 ret void } @@ -107,7 +107,7 @@ ; BOTH-LABEL: {{^}}local_f64_store: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_f64_store(double addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind { %gep = getelementptr double, double addrspace(3)* %out, i32 7 store double 16.0, double addrspace(3)* %gep, align 8 ret void @@ -115,7 +115,7 @@ ; BOTH-LABEL: {{^}}local_f64_store_0_offset ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { store double 20.0, double addrspace(3)* %out, align 8 ret void } @@ -124,7 +124,7 @@ ; BOTH-NOT: ADD ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; BOTH: s_endpgm -define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 store <2 x i64> , <2 x i64> addrspace(3)* %gep, align 16 ret void @@ -134,7 +134,7 @@ ; BOTH-NOT: 
ADD ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm -define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16 ret void } @@ -144,7 +144,7 @@ ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 ; BOTH: s_endpgm -define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 store <4 x i64> , <4 x i64> addrspace(3)* %gep, align 16 ret void @@ -155,7 +155,7 @@ ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm -define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16 ret void } Index: test/CodeGen/AMDGPU/local-atomics.ll =================================================================== --- test/CodeGen/AMDGPU/local-atomics.ll +++ test/CodeGen/AMDGPU/local-atomics.ll @@ -11,7 +11,7 @@ ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ ; EG: LDS_WRXCHG_RET * ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -37,7 +37,7 @@ ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -47,7 +47,7 @@ ; EG: LDS_ADD_RET * ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ 
-59,7 +59,7 @@ ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -73,7 +73,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -84,7 +84,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -96,7 +96,7 @@ ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -109,7 +109,7 @@ ; EG: LDS_SUB_RET * ; GCN: ds_sub_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -119,7 +119,7 @@ ; EG: LDS_SUB_RET * ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -131,7 +131,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -142,7 +142,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 
v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ ; EG: LDS_AND_RET * ; GCN: ds_and_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -163,7 +163,7 @@ ; EG: LDS_AND_RET * ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -174,7 +174,7 @@ ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -195,7 +195,7 @@ ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -205,7 +205,7 @@ ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -214,7 +214,7 @@ ; FIXME: There is no atomic nand instr ; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst ; store i32 %result, i32 addrspace(1)* %out, align 4 ; ret void @@ -224,7 +224,7 @@ ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -234,7 +234,7 @@ ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -245,7 +245,7 @@ ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -255,7 +255,7 @@ ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -266,7 +266,7 @@ ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -276,7 +276,7 @@ ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -287,7 +287,7 @@ ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ 
-297,7 +297,7 @@ ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -310,7 +310,7 @@ ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -318,7 +318,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -330,7 +330,7 @@ ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_add_u32 [[VPTR]], [[DATA]] ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -338,7 +338,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -348,7 +348,7 @@ ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -360,7 +360,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst ret void } @@ -369,7 +369,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst ret void @@ -379,7 +379,7 @@ ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: 
s_endpgm -define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -390,7 +390,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: ; GCN: ds_sub_u32 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -398,7 +398,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset: ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -408,7 +408,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst ret void } @@ -417,7 +417,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst ret void @@ -426,7 +426,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32: ; GCN: ds_and_b32 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -434,7 +434,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset: ; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -443,7 +443,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32: ; GCN: ds_or_b32 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -451,7 +451,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset: ; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -460,7 +460,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32: ; GCN: ds_xor_b32 ; GCN: s_endpgm 
-define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -468,7 +468,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset: ; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -476,7 +476,7 @@ ; FIXME: There is no atomic nand instr ; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:uction, so we somehow need to expand this. -; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst ; ret void ; } @@ -484,7 +484,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: ; GCN: ds_min_i32 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -492,7 +492,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: ; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -501,7 +501,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: ; GCN: ds_max_i32 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -509,7 +509,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: ; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -518,7 +518,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: ; GCN: ds_min_u32 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -526,7 +526,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: ; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -535,7 +535,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: ; GCN: ds_max_u32 ; GCN: s_endpgm -define void 
@lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -543,7 +543,7 @@ ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: ; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst ret void Index: test/CodeGen/AMDGPU/local-atomics64.ll =================================================================== --- test/CodeGen/AMDGPU/local-atomics64.ll +++ test/CodeGen/AMDGPU/local-atomics64.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64: ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64: ; GCN: ds_add_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -38,7 +38,7 @@ ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -51,7 +51,7 @@ ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -60,7 +60,7 @@ ; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset: ; GCN: ds_add_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { 
+define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64: ; GCN: ds_sub_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -92,7 +92,7 @@ ; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -101,7 +101,7 @@ ; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset: ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -111,7 +111,7 @@ ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64: ; GCN: ds_and_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -120,7 +120,7 @@ ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset: ; GCN: ds_and_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -130,7 +130,7 @@ ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64: ; GCN: ds_or_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst store 
i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -139,7 +139,7 @@ ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset: ; GCN: ds_or_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -149,7 +149,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64: ; GCN: ds_xor_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -158,7 +158,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset: ; GCN: ds_xor_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -167,7 +167,7 @@ ; FIXME: There is no atomic nand instr ; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this. -; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void @@ -176,7 +176,7 @@ ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64: ; GCN: ds_min_rtn_i64 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -185,7 +185,7 @@ ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset: ; GCN: ds_min_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -195,7 +195,7 @@ ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64: ; GCN: ds_max_rtn_i64 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -204,7 +204,7 @@ ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset: ; GCN: ds_max_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 
addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -214,7 +214,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64: ; GCN: ds_min_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -223,7 +223,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: ; GCN: ds_min_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -233,7 +233,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64: ; GCN: ds_max_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -242,7 +242,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: ; GCN: ds_max_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -252,7 +252,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64: ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -260,7 +260,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -269,7 +269,7 @@ ; GCN-LABEL: {{^}}lds_atomic_add_noret_i64: ; GCN: ds_add_u64 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -282,7 +282,7 @@ ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* 
%ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst ret void @@ -293,7 +293,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst ret void } @@ -301,7 +301,7 @@ ; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset: ; GCN: ds_add_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst ret void @@ -310,7 +310,7 @@ ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64: ; GCN: ds_sub_u64 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -318,7 +318,7 @@ ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -329,7 +329,7 @@ ; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} ; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst ret void } @@ -337,7 +337,7 @@ ; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset: ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst ret void @@ -346,7 +346,7 @@ ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64: ; GCN: ds_and_b64 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -354,7 +354,7 @@ ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset: ; GCN: ds_and_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -363,7 +363,7 @@ ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64: ; GCN: ds_or_b64 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) 
nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -371,7 +371,7 @@ ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset: ; GCN: ds_or_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -380,7 +380,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64: ; GCN: ds_xor_b64 ; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -388,7 +388,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset: ; GCN: ds_xor_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -396,7 +396,7 @@ ; FIXME: There is no atomic nand instr ; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this. -; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst ; ret void ; } @@ -404,7 +404,7 @@ ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64: ; GCN: ds_min_i64 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -412,7 +412,7 @@ ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset: ; GCN: ds_min_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -421,7 +421,7 @@ ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64: ; GCN: ds_max_i64 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -429,7 +429,7 @@ ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset: ; GCN: ds_max_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -438,7 +438,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64: ; GCN: ds_min_u64 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i64 
addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -446,7 +446,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset: ; GCN: ds_min_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -455,7 +455,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64: ; GCN: ds_max_u64 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -463,7 +463,7 @@ ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset: ; GCN: ds_max_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst ret void Index: test/CodeGen/AMDGPU/local-memory.amdgcn.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -17,7 +17,7 @@ ; GCN: s_barrier ; GCN: ds_read_b32 {{v[0-9]+}}, -define void @local_memory(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 { entry: %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i @@ -61,7 +61,7 @@ ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 -define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i Index: test/CodeGen/AMDGPU/local-memory.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory.ll +++ test/CodeGen/AMDGPU/local-memory.ll @@ -14,7 +14,7 @@ ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4 ; R600: LDS_READ_RET -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { entry: %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 %tmp1 = load i32, i32 addrspace(3)* %tmp0 @@ -30,7 +30,7 @@ ; R600: LDS_READ_RET ; GCN-DAG: ds_read_b32 ; GCN-DAG: ds_read2_b32 -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 { %scalar = load i32, i32 addrspace(3)* %in %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 Index: test/CodeGen/AMDGPU/local-memory.r600.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory.r600.ll +++ 
test/CodeGen/AMDGPU/local-memory.r600.ll @@ -15,7 +15,7 @@ ; EG-NEXT: ALU clause ; EG: LDS_READ_RET -define void @local_memory(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 { entry: %y.i = call i32 @llvm.r600.read.tidig.x() #1 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i @@ -57,7 +57,7 @@ ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] -define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.r600.read.tidig.x() #1 %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i Index: test/CodeGen/AMDGPU/loop-address.ll =================================================================== --- test/CodeGen/AMDGPU/loop-address.ll +++ test/CodeGen/AMDGPU/loop-address.ll @@ -5,7 +5,7 @@ ;CHECK: LOOP_BREAK @10 ;CHECK: POP @10 -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { +define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { entry: %cmp5 = icmp sgt i32 %iterations, 0 br i1 %cmp5, label %for.body, label %for.end Index: test/CodeGen/AMDGPU/loop-idiom.ll =================================================================== --- test/CodeGen/AMDGPU/loop-idiom.ll +++ test/CodeGen/AMDGPU/loop-idiom.ll @@ -9,7 +9,7 @@ ; FUNC: @no_memcpy ; R600-NOT: {{^}}llvm.memcpy ; SI-NOT: {{^}}llvm.memcpy -define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { +define amdgpu_kernel void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { entry: %dest = alloca i8, i32 32 br label %for.body @@ -33,7 +33,7 @@ ; R600-NOT: {{^}}memset_pattern16: ; SI-NOT: {{^}}llvm.memset ; SI-NOT: {{^}}memset_pattern16: -define void @no_memset(i32 %size) { +define amdgpu_kernel void @no_memset(i32 %size) { entry: %dest = alloca i8, i32 32 br label %for.body Index: test/CodeGen/AMDGPU/loop_break.ll =================================================================== --- test/CodeGen/AMDGPU/loop_break.ll +++ test/CodeGen/AMDGPU/loop_break.ll @@ -43,7 +43,7 @@ ; GCN: ; BB#4: ; %bb9 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN-NEXT: s_endpgm -define void @break_loop(i32 %arg) #0 { +define amdgpu_kernel void @break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -87,7 +87,7 @@ ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @undef_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -140,7 +140,7 @@ ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @constexpr_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -190,7 +190,7 @@ ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @true_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -240,7 +240,7 @@ ; OPT-NEXT: call void 
@llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @false_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -295,7 +295,7 @@ ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %1) ; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef ; OPT-NEXT: ret void -define void @invert_true_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg Index: test/CodeGen/AMDGPU/lower-mem-intrinsics.ll =================================================================== --- test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -9,7 +9,7 @@ ; Test the upper bound for sizes to leave ; OPT-LABEL: @max_size_small_static_memcpy_caller0( ; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) -define void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) ret void } @@ -21,14 +21,14 @@ ; OPT-NEXT: load i8 ; OPT: getelementptr ; OPT-NEXT: store i8 -define void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false) ret void } ; OPT-LABEL: @max_size_small_static_memmove_caller0( ; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) -define void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) ret void } @@ -39,14 +39,14 @@ ; OPT-NEXT: load i8 ; OPT: getelementptr ; OPT-NEXT: store i8 -define void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false) ret void } ; OPT-LABEL: @max_size_small_static_memset_caller0( ; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false) -define void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { +define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false) ret void } @@ -55,7 +55,7 @@ ; OPT-NOT: call ; OPT: getelementptr ; OPT: store i8 -define void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { +define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 
1025, i32 1, i1 false) ret void } @@ -63,7 +63,7 @@ ; OPT-LABEL: @variable_memcpy_caller0( ; OPT-NOT: call ; OPT: phi -define void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { +define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) ret void } @@ -71,7 +71,7 @@ ; OPT-LABEL: @variable_memcpy_caller1( ; OPT-NOT: call ; OPT: phi -define void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { +define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) ret void } @@ -82,7 +82,7 @@ ; OPT-NOT: call ; OPT: phi ; OPT-NOT: call -define void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 { +define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false) ret void @@ -94,7 +94,7 @@ ; OPT: load i8, i8 addrspace(3)* ; OPT: getelementptr inbounds i8, i8 addrspace(1)* ; OPT: store i8 -define void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 { +define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 { call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false) ret void } @@ -107,7 +107,7 @@ ; OPT: store i8 ; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false) -define void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 { +define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false) ret void Index: test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll =================================================================== --- test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll +++ test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range: ; CHECK-NOT: v0 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 -define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 %and = and i32 %id, 1023 @@ -16,7 +16,7 @@ ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range: ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] -define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void 
@test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 %and = and i32 %id, 511 @@ -28,7 +28,7 @@ ; CHECK-NOT: v0 ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] -define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1 %and = and i32 %id, 255 Index: test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/lshr.v2i16.ll +++ test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -12,7 +12,7 @@ ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -38,7 +38,7 @@ ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -55,7 +55,7 @@ ; GFX9: s_load_dword [[RHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -70,7 +70,7 @@ ; GFX9: s_load_dword [[LHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -84,7 +84,7 @@ ; GCN-LABEL: {{^}}lshr_imm_v_v2i16: ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 -define void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -98,7 +98,7 @@ ; GCN-LABEL: {{^}}lshr_v_imm_v2i16: ; GCN: 
{{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] -define void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -115,7 +115,7 @@ ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -133,7 +133,7 @@ ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext Index: test/CodeGen/AMDGPU/mad-combine.ll =================================================================== --- test/CodeGen/AMDGPU/mad-combine.ll +++ test/CodeGen/AMDGPU/mad-combine.ll @@ -31,7 +31,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] -define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -70,7 +70,7 @@ ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -107,7 +107,7 @@ ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] -define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -137,7 +137,7 @@ ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] -define void 
@combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -174,7 +174,7 @@ ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -209,7 +209,7 @@ ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] ; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -245,7 +245,7 @@ ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -281,7 +281,7 @@ ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -319,7 +319,7 @@ ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 
1 @@ -362,7 +362,7 @@ ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -404,7 +404,7 @@ ; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -447,7 +447,7 @@ ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -497,7 +497,7 @@ ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -548,7 +548,7 @@ ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 Index: test/CodeGen/AMDGPU/mad24-get-global-id.ll =================================================================== --- test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -11,7 +11,7 @@ ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]] ; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0 -define void @get_global_id_0(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @get_global_id_0(i32 
addrspace(1)* %out) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1 Index: test/CodeGen/AMDGPU/mad_int24.ll =================================================================== --- test/CodeGen/AMDGPU/mad_int24.ll +++ test/CodeGen/AMDGPU/mad_int24.ll @@ -11,7 +11,7 @@ ; CM: MULADD_INT24 ; SI-NOT: and ; SI: v_mad_i32_i24 -define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 %a_24 = ashr i32 %0, 8 Index: test/CodeGen/AMDGPU/mad_uint24.ll =================================================================== --- test/CodeGen/AMDGPU/mad_uint24.ll +++ test/CodeGen/AMDGPU/mad_uint24.ll @@ -11,7 +11,7 @@ ; SI: v_mad_u32_u24 ; VI: v_mad_u32_u24 -define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -32,7 +32,7 @@ ; FIXME: Should be using scalar instructions here. ; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 -define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: %0 = mul i16 %a, %b %1 = add i16 %0, %c @@ -49,7 +49,7 @@ ; EG: 8 ; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 -define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { entry: %0 = mul i8 %a, %b %1 = add i8 %0, %c @@ -68,7 +68,7 @@ ; FUNC-LABEL: {{^}}i24_i32_i32_mad: ; EG: CNDE_INT ; SI: v_cndmask -define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: %0 = ashr i32 %a, 8 %1 = icmp ne i32 %c, 0 Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -10,7 +10,7 @@ ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -37,7 +37,7 @@ ; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]] ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]] ; GCN: s_endpgm -define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 
+64,7 @@ ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { +define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -84,7 +84,7 @@ ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -106,7 +106,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { +define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -125,7 +125,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -141,7 +141,7 @@ ; GCN-LABEL: {{^}}s_s_madak_f32: ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 store float %madak, float addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} ; GCN: s_endpgm -define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ 
-175,7 +175,7 @@ ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} ; GCN: s_endpgm -define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -201,7 +201,7 @@ ; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]] ; GCN: buffer_store_dword [[MUL]] -define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { +define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 br i1 %tmp, label %bb3, label %bb4 Index: test/CodeGen/AMDGPU/madmk.ll =================================================================== --- test/CodeGen/AMDGPU/madmk.ll +++ test/CodeGen/AMDGPU/madmk.ll @@ -12,7 +12,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]] -define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -35,7 +35,7 @@ ; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]] ; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]] ; GCN: s_endpgm -define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 +64,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]] -define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -83,7 +83,7 @@ ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = 
getelementptr float, float addrspace(1)* %out, i32 %tid @@ -97,7 +97,7 @@ ; GCN-NOT: v_madmk_f32 ; GCN: v_mad_f32 ; GCN: s_endpgm -define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { +define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -113,7 +113,7 @@ ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { +define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -130,7 +130,7 @@ ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]] -define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -151,7 +151,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| -define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -172,7 +172,7 @@ ; GCN: buffer_load_dword [[A:v[0-9]+]] ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 -define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -189,7 +189,7 @@ ; SI: s_xor_b64 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}} ; SI: s_or_b64 -define void @kill_madmk_verifier_error() nounwind { +define amdgpu_kernel void @kill_madmk_verifier_error() nounwind { bb: br label %bb2 Index: test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- test/CodeGen/AMDGPU/max.i16.ll 
+++ test/CodeGen/AMDGPU/max.i16.ll @@ -4,7 +4,7 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sge_i16: ; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid @@ -45,7 +45,7 @@ ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid @@ -67,7 +67,7 @@ ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid @@ -83,7 +83,7 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sgt_i16: ; VIPLUS: v_max_i16_e32 -define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -99,7 +99,7 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). 
; GCN-LABEL: {{^}}v_test_umax_uge_i16: ; VIPLUS: v_max_u16_e32 -define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -115,7 +115,7 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_umax_ugt_i16: ; VIPLUS: v_max_u16_e32 -define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -133,7 +133,7 @@ ; VI: v_max_u16_e32 ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid Index: test/CodeGen/AMDGPU/max.ll =================================================================== --- test/CodeGen/AMDGPU/max.ll +++ test/CodeGen/AMDGPU/max.ll @@ -6,7 +6,7 @@ ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp sge i32 %a, %b @@ -26,7 +26,7 @@ ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind { %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4 %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4 %cmp = icmp sge <4 x i32> %a, %b @@ -39,7 +39,7 @@ ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sge i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -50,7 +50,7 @@ ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cmp = icmp sge i32 %a, 9 %val = select i1 %cmp, i32 %a, i32 9 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -63,7 +63,7 @@ ; SI: 
v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { %a = load i8, i8 addrspace(1)* %aptr, align 1 %b = load i8, i8 addrspace(1)* %bptr, align 1 %cmp = icmp sge i8 %a, %b @@ -76,7 +76,7 @@ ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cmp = icmp sgt i32 %a, 9 %val = select i1 %cmp, i32 %a, i32 9 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { %cmp = icmp sgt <2 x i32> %a, %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 @@ -100,7 +100,7 @@ ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp sgt i32 %a, %b @@ -113,7 +113,7 @@ ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sgt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -124,7 +124,7 @@ ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp uge i32 %a, %b @@ -137,7 +137,7 @@ ; SI: s_max_u32 ; EG: MAX_UINT -define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp uge i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -155,7 +155,7 @@ ; EG: MAX_UINT ; EG: MAX_UINT ; EG-NOT: MAX_UINT -define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { %cmp = icmp uge <3 x i32> %a, %b %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4 @@ -168,7 +168,7 @@ ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { %a = load i8, i8 addrspace(1)* %aptr, 
align 1 %b = load i8, i8 addrspace(1)* %bptr, align 1 %cmp = icmp uge i8 %a, %b @@ -181,7 +181,7 @@ ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp ugt i32 %a, %b @@ -194,7 +194,7 @@ ; SI: s_max_u32 ; EG: MAX_UINT -define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp ugt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -207,7 +207,7 @@ ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}} ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}} -define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { %cmp = icmp ugt <2 x i32> %a, %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 @@ -223,7 +223,7 @@ ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_UINT -define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ugt i32 %a.ext, %b.ext @@ -243,7 +243,7 @@ ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_INT -define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp sgt i32 %a.ext, %b.ext @@ -262,7 +262,7 @@ ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { %cmp = icmp sge i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out @@ -275,7 +275,7 @@ ; EG: MAX_UINT ; EG: MAX_UINT -define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp ugt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -287,7 +287,7 @@ ; EG: MAX_UINT ; EG: MAX_UINT -define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp uge i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -299,7 +299,7 @@ ; EG-DAG: MAX_UINT ; EG-DAG: MAX_INT -define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp sgt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -311,7 +311,7 @@ ; EG-DAG: MAX_UINT ; 
EG-DAG: MAX_INT -define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp sge i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/max3.ll =================================================================== --- test/CodeGen/AMDGPU/max3.ll +++ test/CodeGen/AMDGPU/max3.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: @v_test_imax3_sgt_i32 ; SI: v_max3_i32 -define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ ; FUNC-LABEL: @v_test_umax3_ugt_i32 ; SI: v_max3_u32 -define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid Index: test/CodeGen/AMDGPU/mem-builtins.ll =================================================================== --- test/CodeGen/AMDGPU/mem-builtins.ll +++ test/CodeGen/AMDGPU/mem-builtins.ll @@ -9,7 +9,7 @@ ; ERROR: error: :0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp -define void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 { +define amdgpu_kernel void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 { entry: %cmp = tail call i32 @memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i64 2) store volatile i32 %cmp, i32 addrspace(1)* undef @@ -17,35 +17,35 @@ } ; ERROR: error: :0:0: in function test_memchr void (i8 addrspace(1)*, i32, i64): unsupported call to function memchr -define void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 { +define amdgpu_kernel void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 { %res = call i8 addrspace(1)* @memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) store volatile i8 addrspace(1)* %res, i8 addrspace(1)* addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strcpy void (i8*, i8*): unsupported call to function strcpy -define void @test_strcpy(i8* %dst, i8* %src) #0 { +define amdgpu_kernel void @test_strcpy(i8* %dst, i8* %src) #0 { %res = call i8* @strcpy(i8* %dst, i8* %src) store volatile i8* %res, i8* addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strcmp void (i8*, i8*): unsupported call to function strcmp -define void @test_strcmp(i8* %src0, i8* %src1) #0 { +define amdgpu_kernel void @test_strcmp(i8* %src0, i8* %src1) #0 { %res = call i32 @strcmp(i8* %src0, i8* %src1) store volatile i32 %res, i32 addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strlen void (i8*): unsupported call to function strlen -define void @test_strlen(i8* %src) #0 { +define amdgpu_kernel void 
@test_strlen(i8* %src) #0 { %res = call i32 @strlen(i8* %src) store volatile i32 %res, i32 addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strnlen void (i8*, i32): unsupported call to function strnlen -define void @test_strnlen(i8* %src, i32 %size) #0 { +define amdgpu_kernel void @test_strnlen(i8* %src, i32 %size) #0 { %res = call i32 @strnlen(i8* %src, i32 %size) store volatile i32 %res, i32 addrspace(1)* undef ret void Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@ -13,7 +13,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -25,7 +25,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -35,7 +35,7 @@ ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: ; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: ; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 0, i16 addrspace(1)* %out.gep.1 @@ -57,7 +57,7 @@ ; GCN: buffer_store_short ; GCN: buffer_store_short ; GCN: s_endpgm -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -69,7 +69,7 @@ ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: ; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* store float 1.0, float addrspace(1)* %out.gep.1.bc @@ -91,7 +91,7 @@ ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b ; GCN: 
buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* store i32 123, i32 addrspace(1)* %out.gep.1.bc @@ -105,7 +105,7 @@ ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -119,7 +119,7 @@ ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -134,7 +134,7 @@ ; First store is out of order. ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -149,7 +149,7 @@ ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32: ; GCN-AA: buffer_store_dwordx4 v ; GCN: s_endpgm -define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -169,7 +169,7 @@ ; SI-DAG: buffer_store_dword ; SI-NOT: buffer_store_dword ; GCN: s_endpgm -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -181,7 +181,7 @@ ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 store i64 123, i64 addrspace(1)* %out.gep.1 @@ -192,7 +192,7 @@ ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = 
getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 @@ -207,7 +207,7 @@ ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx2 [[LOAD]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -222,7 +222,7 @@ ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 @@ -241,7 +241,7 @@ ; GCN: buffer_load_dword v ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -256,7 +256,7 @@ ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -283,7 +283,7 @@ ; SI-DAG: buffer_store_dword v ; SI-DAG: buffer_store_dwordx2 v ; GCN: s_endpgm -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -302,7 +302,7 @@ ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -325,7 +325,7 @@ ; GCN-LABEL: 
{{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 @@ -351,7 +351,7 @@ ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: s_barrier ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -388,7 +388,7 @@ ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -416,7 +416,7 @@ ; GCN: buffer_load_dword [[LOAD:v[0-9]+]] ; GCN: buffer_store_dword [[LOAD]] ; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -446,7 +446,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -470,7 +470,7 @@ ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] ; GCN: s_endpgm -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -492,7 +492,7 @@ ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: s_endpgm -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 
addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 store i8 123, i8 addrspace(3)* %out.gep.1 @@ -504,7 +504,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1 @@ -522,7 +522,7 @@ ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1 ; GCN: s_endpgm -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 @@ -540,7 +540,7 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dword v[[HI]] -define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 12, i32 addrspace(1)* %idx1, align 4 @@ -556,7 +556,7 @@ ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx2 -define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { store i32 13, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 15, i32 addrspace(1)* %idx1, align 4 @@ -575,7 +575,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx2 ; GCN: buffer_store_dword v -define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -596,7 +596,7 @@ ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -630,7 +630,7 @@ ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; GCN: ScratchSize: 0{{$}} -define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 store <3 x i32> %vec, <3 x i32> addrspace(1)* %out ret void @@ -646,7 +646,7 @@ ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} ; GCN: ScratchSize: 0{{$}} -define void 
@copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 store <3 x i64> %vec, <3 x i64> addrspace(1)* %out ret void @@ -662,7 +662,7 @@ ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; GCN: ScratchSize: 0{{$}} -define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 %fadd = fadd <3 x float> %vec, store <3 x float> %fadd, <3 x float> addrspace(1)* %out @@ -679,7 +679,7 @@ ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} ; GCN: ScratchSize: 0{{$}} -define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 %fadd = fadd <3 x double> %vec, store <3 x double> %fadd, <3 x double> addrspace(1)* %out Index: test/CodeGen/AMDGPU/min.ll =================================================================== --- test/CodeGen/AMDGPU/min.ll +++ test/CodeGen/AMDGPU/min.ll @@ -7,7 +7,7 @@ ; GCN: v_min_i32_e32 ; EG: MIN_INT -define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid @@ -24,7 +24,7 @@ ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp sle i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -35,7 +35,7 @@ ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %cmp = icmp sle <1 x i32> %a, %b %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b store <1 x i32> %val, <1 x i32> addrspace(1)* %out @@ -52,7 +52,7 @@ ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %cmp = icmp sle <4 x i32> %a, %b %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %val, <4 x i32> addrspace(1)* %out @@ -65,7 +65,7 @@ ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: s_min_i32 -define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 { 
+define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 { %cmp = icmp sle i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b store i8 %val, i8 addrspace(1)* %out @@ -106,7 +106,7 @@ ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 { %cmp = icmp sle <4 x i8> %a, %b %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %val, <4 x i8> addrspace(1)* %out @@ -124,7 +124,7 @@ ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp sle <2 x i16> %a, %b %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b store <2 x i16> %val, <2 x i16> addrspace(1)* %out @@ -150,7 +150,7 @@ ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 { %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %val, <4 x i16> addrspace(1)* %out @@ -161,7 +161,7 @@ ; GCN: v_min_i32_e32 ; EG: MIN_INT -define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid @@ -180,7 +180,7 @@ ; GFX89: v_min_i16_e32 ; EG: MIN_INT -define void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid @@ -198,7 +198,7 @@ ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp slt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -211,7 +211,7 @@ ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %val, <2 x i32> addrspace(1)* %out @@ -222,7 +222,7 @@ ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp slt i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -233,7 +233,7 @@ ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: 
MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp sle i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -244,7 +244,7 @@ ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid @@ -267,7 +267,7 @@ ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid @@ -301,7 +301,7 @@ ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid @@ -319,7 +319,7 @@ ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ule i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -330,7 +330,7 @@ ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid @@ -353,7 +353,7 @@ ; GFX89: v_min_u16_e32 ; EG: MIN_UINT -define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid @@ -371,7 +371,7 @@ ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ult i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ 
-386,7 +386,7 @@ ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp ult i32 %a, %b @@ -404,7 +404,7 @@ ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { %a = load i16, i16 addrspace(1)* %aptr, align 2 %b = load i16, i16 addrspace(1)* %bptr, align 2 %cmp = icmp ult i16 %a, %b @@ -419,7 +419,7 @@ ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %cmp = icmp ult <1 x i32> %a, %b %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b store <1 x i32> %val, <1 x i32> addrspace(1)* %out @@ -444,7 +444,7 @@ ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 { %cmp = icmp ult <8 x i32> %a, %b %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %val, <8 x i32> addrspace(1)* %out @@ -478,7 +478,7 @@ ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 { %cmp = icmp ult <8 x i16> %a, %b %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b store <8 x i16> %val, <8 x i16> addrspace(1)* %out @@ -494,7 +494,7 @@ ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT -define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ult i32 %a.ext, %b.ext @@ -514,7 +514,7 @@ ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp slt i32 %a.ext, %b.ext @@ -529,7 +529,7 @@ ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 { %cmp = icmp sle i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out @@ -542,7 +542,7 @@ ; EG: MIN_UINT ; EG: MIN_UINT -define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, 
i64 %b) #0 { %tmp = icmp ult i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -554,7 +554,7 @@ ; EG: MIN_UINT ; EG: MIN_UINT -define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp ule i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -566,7 +566,7 @@ ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp slt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -578,7 +578,7 @@ ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp sle i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -596,7 +596,7 @@ ; EG: MIN_INT ; EG: MIN_INT -define void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid @@ -621,7 +621,7 @@ ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid Index: test/CodeGen/AMDGPU/min3.ll =================================================================== --- test/CodeGen/AMDGPU/min3.ll +++ test/CodeGen/AMDGPU/min3.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: @v_test_imin3_slt_i32 ; SI: v_min3_i32 -define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ ; FUNC-LABEL: @v_test_umin3_ult_i32 ; SI: v_min3_u32 -define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -43,7 +43,7 @@ ; FUNC-LABEL: 
@v_test_umin_umin_umin ; SI: v_min_i32 ; SI: v_min3_i32 -define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tid2 = mul i32 %tid, 2 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid @@ -77,7 +77,7 @@ ; FUNC-LABEL: @v_test_umin3_2_uses ; SI-NOT: v_min3 -define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tid2 = mul i32 %tid, 2 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid Index: test/CodeGen/AMDGPU/missing-store.ll =================================================================== --- test/CodeGen/AMDGPU/missing-store.ll +++ test/CodeGen/AMDGPU/missing-store.ll @@ -15,7 +15,7 @@ ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}} ; SI: buffer_store_dword ; SI: s_endpgm -define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -19,7 +19,7 @@ ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, -define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { +define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { bb: %tmp = icmp sgt i32 %arg3, 0 br i1 %tmp, label %bb4, label %bb17 Index: test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll =================================================================== --- test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32: ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep @@ -31,7 +31,7 @@ ; GCN-LABEL: {{^}}atomic_max_i32_noret: ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}} -define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* 
%in, i32 addrspace(1)* %x, i32 %y) #0 { +define amdgpu_kernel void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep Index: test/CodeGen/AMDGPU/mubuf.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf.ll +++ test/CodeGen/AMDGPU/mubuf.ll @@ -9,7 +9,7 @@ ; MUBUF load with an immediate byte offset that fits into 12-bits ; CHECK-LABEL: {{^}}mubuf_load0: ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 -define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1 %1 = load i32, i32 addrspace(1)* %0 @@ -20,7 +20,7 @@ ; MUBUF load with the largest possible immediate offset ; CHECK-LABEL: {{^}}mubuf_load1: ; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 -define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 %1 = load i8, i8 addrspace(1)* %0 @@ -32,7 +32,7 @@ ; CHECK-LABEL: {{^}}mubuf_load2: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 -define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 %1 = load i32, i32 addrspace(1)* %0 @@ -44,7 +44,7 @@ ; CHECK-LABEL: {{^}}mubuf_load3: ; CHECK-NOT: ADD ; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 -define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { +define amdgpu_kernel void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 @@ -91,7 +91,7 @@ ; MUBUF store with an immediate byte offset that fits into 12-bits ; CHECK-LABEL: {{^}}mubuf_store0: ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 -define void @mubuf_store0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store0(i32 addrspace(1)* %out) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 store i32 0, i32 addrspace(1)* %0 @@ -102,7 +102,7 @@ ; CHECK-LABEL: {{^}}mubuf_store1: ; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 -define void @mubuf_store1(i8 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store1(i8 addrspace(1)* %out) { entry: %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 store i8 0, i8 addrspace(1)* %0 @@ -113,7 +113,7 @@ ; CHECK-LABEL: {{^}}mubuf_store2: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 -define 
void @mubuf_store2(i32 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store2(i32 addrspace(1)* %out) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 store i32 0, i32 addrspace(1)* %0 @@ -124,7 +124,7 @@ ; CHECK-LABEL: {{^}}mubuf_store3: ; CHECK-NOT: ADD ; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 -define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { +define amdgpu_kernel void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 @@ -134,14 +134,14 @@ ; CHECK-LABEL: {{^}}store_sgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 -define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { store i32 99, i32 addrspace(1)* %out, align 4 ret void } ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 -define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -150,7 +150,7 @@ ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -159,7 +159,7 @@ ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst ret void @@ -167,7 +167,7 @@ ; CHECK-LABEL: {{^}}store_vgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid store i32 99, i32 addrspace(1)* %out.gep, align 4 Index: test/CodeGen/AMDGPU/mul.ll =================================================================== --- test/CodeGen/AMDGPU/mul.ll +++ test/CodeGen/AMDGPU/mul.ll @@ -11,7 +11,7 @@ ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -31,7 +31,7 @@ ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: 
v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -45,7 +45,7 @@ ; SI: s_load_dword ; SI: s_mul_i32 ; SI: buffer_store_dword -define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %mul = mul i64 %b, %a %trunc = trunc i64 %mul to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -57,7 +57,7 @@ ; SI: s_load_dword ; SI: v_mul_lo_i32 ; SI: buffer_store_dword -define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %mul = mul i64 %b, %a @@ -73,7 +73,7 @@ ; EG-DAG: MULHI_INT ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_i32 -define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { entry: %0 = sext i32 %in to i64 %1 = mul i64 %0, 80 @@ -87,7 +87,7 @@ ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_i32 ; SI: s_endpgm -define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -99,7 +99,7 @@ ; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; SI: s_endpgm -define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 9 @@ -114,7 +114,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], ; SI: s_endpgm -define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %mul = mul i32 %a, %b store i32 %mul, i32 addrspace(1)* %out, align 4 ret void @@ -122,7 +122,7 @@ ; FUNC-LABEL: {{^}}v_mul_i32: ; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -139,7 +139,7 @@ ; crash with a 'failed to select' error. 
; FUNC-LABEL: {{^}}s_mul_i64: -define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %mul = mul i64 %a, %b store i64 %mul, i64 addrspace(1)* %out, align 8 ret void @@ -147,7 +147,7 @@ ; FUNC-LABEL: {{^}}v_mul_i64: ; SI: v_mul_lo_i32 -define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %mul = mul i64 %a, %b @@ -157,7 +157,7 @@ ; FUNC-LABEL: {{^}}mul32_in_branch: ; SI: s_mul_i32 -define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -180,7 +180,7 @@ ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 ; SI: s_endpgm -define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -224,7 +224,7 @@ ; SI: s_mul_i32 ; SI: buffer_store_dwordx4 -define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { +define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { %mul = mul i128 %a, %b store i128 %mul, i128 addrspace(1)* %out ret void @@ -253,7 +253,7 @@ ; SI-DAG: v_mul_lo_i32 ; SI: {{buffer|flat}}_store_dwordx4 -define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid Index: test/CodeGen/AMDGPU/mul_int24.ll =================================================================== --- test/CodeGen/AMDGPU/mul_int24.ll +++ test/CodeGen/AMDGPU/mul_int24.ll @@ -13,7 +13,7 @@ ; Make sure we are not masking the inputs ; CM-NOT: AND ; CM: MUL_INT24 -define void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %a.shl = shl i32 %a, 8 %a.24 = ashr i32 %a.shl, 8 @@ -39,7 +39,7 @@ ; CM: MULHI_INT24 ; CM: MULHI_INT24 ; CM: MULHI_INT24 -define void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %a.shl = shl i32 %a, 8 %a.24 = ashr i32 %a.shl, 8 @@ -70,7 +70,7 @@ ; GCN-DAG: v_mul_i32_i24_e32 ; GCN: buffer_store_dwordx2 -define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 @@ -87,7 +87,7 @@ ; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN: buffer_store_dwordx2 -define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { 
+define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 @@ -112,7 +112,7 @@ ; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 -define void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { +define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { entry: %a.shl = shl i33 %a, 9 %a.24 = ashr i33 %a.shl, 9 @@ -133,7 +133,7 @@ ; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]], ; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; SI-NEXT: buffer_store_dword v[[HI]] -define void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = ashr i33 %tmp0, 9 @@ -151,7 +151,7 @@ ; GCN: v_mul_i32_i24_e32 v[[VAL_LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], v[[VAL_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} -define void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { +define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { bb: %cmp = icmp eq i32 %arg0, 0 br i1 %cmp, label %bb11, label %bb7 Index: test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll =================================================================== --- test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}test_umul24_i32: ; GCN: v_mul_u32_u24 -define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -22,7 +22,7 @@ ; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 ; VI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}} ; VI: s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]] -define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = sext i16 %mul to i32 @@ -34,7 +34,7 @@ ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 -define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x @@ -54,7 +54,7 @@ ; VI: s_mul_i32 ; VI: s_and_b32 ; VI: v_mov_b32_e32 -define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = zext i16 %mul to i32 @@ -66,7 +66,7 @@ ; SI: v_mul_u32_u24_e32 ; SI: v_and_b32_e32 ; VI: v_mul_lo_u16 -define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x @@ -83,7 +83,7 @@ ; SI: 
v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 -define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { +define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -101,7 +101,7 @@ ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.24 = and i32 %a, 16777215 %b.24 = and i32 %b, 16777215 @@ -118,7 +118,7 @@ ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: %a.24 = and i64 %a, 16777215 %b.24 = and i64 %b, 16777215 @@ -136,7 +136,7 @@ ; GCN-DAG: v_mul_u32_u24_e32 ; GCN-DAG: v_mul_hi_u32_u24_e32 ; GCN: buffer_store_dwordx2 -define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: %tmp0 = shl i64 %a, 40 %a_24 = lshr i64 %tmp0, 40 @@ -152,7 +152,7 @@ ; GCN-NOT: s_and_b32 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] -define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { entry: %tmp0 = shl i64 %a, 40 %a.24 = lshr i64 %tmp0, 40 @@ -166,7 +166,7 @@ ; GCN: s_and_b32 ; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]] ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]] -define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.16 = and i32 %a, 65535 %b.16 = and i32 %b, 65535 @@ -186,7 +186,7 @@ ; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], ; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}} -define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = lshr i33 %tmp0, 9 @@ -206,7 +206,7 @@ ; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], ; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; GCN-NEXT: buffer_store_dword v[[HI]] -define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = lshr i33 %tmp0, 9 Index: test/CodeGen/AMDGPU/mul_uint24-r600.ll =================================================================== --- test/CodeGen/AMDGPU/mul_uint24-r600.ll +++ test/CodeGen/AMDGPU/mul_uint24-r600.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}test_umul24_i32: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 
8 @@ -19,7 +19,7 @@ ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x ; EG: 16 -define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = sext i16 %mul to i32 @@ -31,7 +31,7 @@ ; FUNC-LABEL: {{^}}test_umul24_i8: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { +define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { entry: %mul = mul i8 %a, %b %ext = sext i8 %mul to i32 @@ -41,7 +41,7 @@ ; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.24 = and i32 %a, 16777215 %b.24 = and i32 %b, 16777215 @@ -56,7 +56,7 @@ ; FUNC-LABEL: {{^}}test_umulhi24: ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: %a.24 = and i64 %a, 16777215 %b.24 = and i64 %b, 16777215 @@ -71,7 +71,7 @@ ; FUNC-LABEL: {{^}}test_umul24_i64: ; EG; MUL_UINT24 ; EG: MULHI -define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: %tmp0 = shl i64 %a, 40 %a_24 = lshr i64 %tmp0, 40 Index: test/CodeGen/AMDGPU/multilevel-break.ll =================================================================== --- test/CodeGen/AMDGPU/multilevel-break.ll +++ test/CodeGen/AMDGPU/multilevel-break.ll @@ -64,7 +64,7 @@ br i1 %tmp51, label %LOOP, label %LOOP.outer } -; OPT-LABEL: define void @multi_if_break_loop( +; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop( ; OPT: llvm.amdgcn.break ; OPT: llvm.amdgcn.loop ; OPT: llvm.amdgcn.if.break @@ -79,7 +79,7 @@ ; Uses a copy intsead of an or ; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]] ; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]] -define void @multi_if_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg Index: test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll =================================================================== --- test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll +++ test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll @@ -9,7 +9,7 @@ @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 ; CHECK-DAG: Name: load_extern_const_init -define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ @undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 ; CHECK-DAG: Name: undef_const_addrspace -define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { +define 
amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/no-shrink-extloads.ll =================================================================== --- test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -9,7 +9,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: ; SI: s_load_dword s ; SI: buffer_store_short v -define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i16 store i16 %trunc, i16 addrspace(1)* %out ret void @@ -21,7 +21,7 @@ ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: ; SI: buffer_load_dword v ; SI: buffer_store_short v -define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid @@ -34,7 +34,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -43,7 +43,7 @@ ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -56,7 +56,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i1 store i1 %trunc, i1 addrspace(1)* %out ret void @@ -65,7 +65,7 @@ ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid @@ -78,7 +78,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { %trunc = trunc i64 %arg to i32 store i32 %trunc, i32 addrspace(1)* 
%out ret void @@ -87,7 +87,7 @@ ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -100,7 +100,7 @@ ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -110,7 +110,7 @@ ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -125,7 +125,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { %trunc = trunc i16 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -134,7 +134,7 @@ ; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: ; SI: buffer_load_ubyte v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -147,7 +147,7 @@ ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i8 store i8 %trunc, i8 addrspace(1)* %out @@ -157,7 +157,7 @@ ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -171,7 +171,7 @@ ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { %trunc = trunc 
i64 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -180,7 +180,7 @@ ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -194,7 +194,7 @@ ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0 ; SI: s_waitcnt lgkmcnt(0) ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff -define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %val = load i32, i32 addrspace(2)* %in %mask = and i32 %val, 65535 @@ -205,7 +205,7 @@ ; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %bc = bitcast <2 x i32> %ld to i64 %hi = lshr i64 %bc, 32 Index: test/CodeGen/AMDGPU/opencl-image-metadata.ll =================================================================== --- test/CodeGen/AMDGPU/opencl-image-metadata.ll +++ test/CodeGen/AMDGPU/opencl-image-metadata.ll @@ -6,7 +6,7 @@ ; EG: CF_END ; SI: s_endpgm -define void @kernel(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/operand-folding.ll =================================================================== --- test/CodeGen/AMDGPU/operand-folding.ll +++ test/CodeGen/AMDGPU/operand-folding.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: {{^}}fold_sgpr: ; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s -define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { +define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { entry: %tmp0 = icmp ne i32 %fold, 0 br i1 %tmp0, label %if, label %endif @@ -20,7 +20,7 @@ ; CHECK-LABEL: {{^}}fold_imm: ; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 -define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { +define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { entry: %fold = add i32 3, 2 %tmp0 = icmp ne i32 %cmp, 0 @@ -46,7 +46,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] ; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, -define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { +define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { entry: %tmp0 = add i64 %val, 1 store i64 %tmp0, i64 addrspace(1)* %out @@ -61,7 +61,7 @@ ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -define void @vector_inline(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -80,7 +80,7 @@ ; CHECK-LABEL: {{^}}imm_one_use: ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} -define void @imm_one_use(i32 addrspace(1)* %out) { +define amdgpu_kernel void 
@imm_one_use(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = xor i32 %tmp0, 100 @@ -94,7 +94,7 @@ ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -define void @vector_imm(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -114,7 +114,7 @@ ; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]] ; CHECK: buffer_store_dword v[[LO]] -define void @no_fold_tied_subregister() { +define amdgpu_kernel void @no_fold_tied_subregister() { %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef %tmp2 = extractelement <2 x float> %tmp1, i32 0 %tmp3 = extractelement <2 x float> %tmp1, i32 1 Index: test/CodeGen/AMDGPU/operand-spacing.ll =================================================================== --- test/CodeGen/AMDGPU/operand-spacing.ll +++ test/CodeGen/AMDGPU/operand-spacing.ll @@ -11,7 +11,7 @@ ; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] ; GCN: buffer_store_dword [[RESULT]], -define void @add_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) { %result = fadd float %a, %b store float %result, float addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/optimize-if-exec-masking.mir =================================================================== --- test/CodeGen/AMDGPU/optimize-if-exec-masking.mir +++ test/CodeGen/AMDGPU/optimize-if-exec-masking.mir @@ -3,7 +3,7 @@ --- | target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - define void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 { main_body: %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 @@ -23,7 +23,7 @@ ret void } - define void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -34,7 +34,7 @@ ret void } - define void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -46,7 +46,7 @@ } - define void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 { main_body: %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 @@ -67,7 +67,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -78,7 +78,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -89,7 +89,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 { main_body: 
br i1 undef, label %if, label %end @@ -100,7 +100,7 @@ ret void } - define void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -111,7 +111,7 @@ ret void } - define void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -122,7 +122,7 @@ ret void } - define void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end Index: test/CodeGen/AMDGPU/or.ll =================================================================== --- test/CodeGen/AMDGPU/or.ll +++ test/CodeGen/AMDGPU/or.ll @@ -9,7 +9,7 @@ ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -28,7 +28,7 @@ ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -39,7 +39,7 @@ ; FUNC-LABEL: {{^}}scalar_or_i32: ; SI: s_or_b32 -define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %or = or i32 %a, %b store i32 %or, i32 addrspace(1)* %out ret void @@ -47,7 +47,7 @@ ; FUNC-LABEL: {{^}}vector_or_i32: ; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { +define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { %loada = load i32, i32 addrspace(1)* %a %or = or i32 %loada, %b store i32 %or, i32 addrspace(1)* %out @@ -56,7 +56,7 @@ ; FUNC-LABEL: {{^}}scalar_or_literal_i32: ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f -define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { %or = or i32 %a, 99999 store i32 %or, i32 addrspace(1)* %out, align 4 ret void @@ -68,7 +68,7 @@ ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void @@ -82,7 +82,7 @@ ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* 
%out, i64 %a, i64 %b) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -101,7 +101,7 @@ ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -111,7 +111,7 @@ ; SI-NOT: or_b32 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63 ; SI-NOT: or_b32 -define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out %foo = add i64 %b, 63 @@ -125,7 +125,7 @@ ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -133,7 +133,7 @@ ; FUNC-LABEL: {{^}}vector_or_literal_i32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { %loada = load i32, i32 addrspace(1)* %a, align 4 %or = or i32 %loada, 65535 store i32 %or, i32 addrspace(1)* %out, align 4 @@ -142,7 +142,7 @@ ; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} -define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { %loada = load i32, i32 addrspace(1)* %a, align 4 %or = or i32 %loada, 4 store i32 %or, i32 addrspace(1)* %out, align 4 @@ -154,7 +154,7 @@ ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z ; SI: s_or_b64 -define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = or i64 %a, %b store i64 %or, i64 addrspace(1)* %out ret void @@ -163,7 +163,7 @@ ; FUNC-LABEL: {{^}}vector_or_i64: ; SI: v_or_b32_e32 v{{[0-9]}} ; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %loadb = load i64, i64 addrspace(1)* %b, align 8 %or = or i64 %loada, %loadb @@ -174,7 +174,7 @@ ; FUNC-LABEL: {{^}}scalar_vector_or_i64: ; SI: v_or_b32_e32 v{{[0-9]}} ; SI: v_or_b32_e32 v{{[0-9]}} -define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { +define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { %loada = load i64, i64 addrspace(1)* %a %or = or i64 %loada, %b store i64 %or, i64 addrspace(1)* %out @@ -186,7 +186,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm -define void 
@vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, 22470723082367 store i64 %or, i64 addrspace(1)* %out @@ -200,7 +200,7 @@ ; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0 ; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}} ; SI: s_endpgm -define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, 8 store i64 %or, i64 addrspace(1)* %out @@ -213,7 +213,7 @@ ; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}} ; SI: s_endpgm -define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, -8 store i64 %or, i64 addrspace(1)* %out @@ -226,7 +226,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]] ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, -200 store i64 %or, i64 addrspace(1)* %out @@ -239,7 +239,7 @@ ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %add = or i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -250,7 +250,7 @@ ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { +define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1)* %in0 %b = load float, float addrspace(1)* %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -263,7 +263,7 @@ ; FUNC-LABEL: {{^}}s_or_i1: ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %or = or i1 %cmp0, %cmp1 Index: test/CodeGen/AMDGPU/over-max-lds-size.ll =================================================================== --- test/CodeGen/AMDGPU/over-max-lds-size.ll +++ test/CodeGen/AMDGPU/over-max-lds-size.ll @@ -6,7 +6,7 @@ @huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4 -define void @use_huge_lds() { +define amdgpu_kernel void @use_huge_lds() { entry: %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0 store i32 0, i32 
addrspace(3)* %v0 Index: test/CodeGen/AMDGPU/pack.v2f16.ll =================================================================== --- test/CodeGen/AMDGPU/pack.v2f16.ll +++ test/CodeGen/AMDGPU/pack.v2f16.ll @@ -9,7 +9,7 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { %val0 = load volatile i32, i32 addrspace(2)* %in0 %val1 = load volatile i32, i32 addrspace(2)* %in1 %lo.i = trunc i32 %val0 to i16 @@ -28,7 +28,7 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { %val1 = load i32, i32 addrspace(2)* %in1 %hi.i = trunc i32 %val1 to i16 %hi = bitcast i16 %hi.i to half @@ -44,7 +44,7 @@ ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234 ; GFX9: ; use [[PACKED]] -define void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 { +define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 { %val0 = load i32, i32 addrspace(2)* %in0 %lo.i = trunc i32 %val0 to i16 %lo = bitcast i16 %lo.i to half @@ -64,7 +64,7 @@ ; GFX9-FLUSH: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -91,7 +91,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] ; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] -define void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -118,7 +118,7 @@ ; GFX9-FLUSH-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}} ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -140,7 +140,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -164,7 +164,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { 
%tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -187,7 +187,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -209,7 +209,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext Index: test/CodeGen/AMDGPU/pack.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/pack.v2i16.ll +++ test/CodeGen/AMDGPU/pack.v2i16.ll @@ -9,7 +9,7 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { %val0 = load volatile i32, i32 addrspace(2)* %in0 %val1 = load volatile i32, i32 addrspace(2)* %in1 %lo = trunc i32 %val0 to i16 @@ -26,7 +26,7 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { %val1 = load i32, i32 addrspace(2)* %in1 %hi = trunc i32 %val1 to i16 %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0 @@ -41,7 +41,7 @@ ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8 ; GFX9: ; use [[PACKED]] -define void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { +define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { %val0 = load i32, i32 addrspace(2)* %in0 %lo = trunc i32 %val0 to i16 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 @@ -60,7 +60,7 @@ ; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -85,7 +85,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] ; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] -define void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -111,7 +111,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] ; GFX9: ; use [[PACKED]] 
-define void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -130,7 +130,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64 ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -151,7 +151,7 @@ ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -169,7 +169,7 @@ ; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 7 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL0]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext Index: test/CodeGen/AMDGPU/packetizer.ll =================================================================== --- test/CodeGen/AMDGPU/packetizer.ll +++ test/CodeGen/AMDGPU/packetizer.ll @@ -7,7 +7,7 @@ ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z ; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W -define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { entry: %shl = sub i32 32, %e %x = add i32 %x_arg, 1 Index: test/CodeGen/AMDGPU/parallelandifcollapse.ll =================================================================== --- test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -11,7 +11,7 @@ ; to do its transfomation, however now that we are using local memory for ; allocas, the transformation isn't happening. 
-define void @_Z9chk1D_512v() #0 { +define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: %a0 = alloca i32, align 4 %b0 = alloca i32, align 4 Index: test/CodeGen/AMDGPU/parallelorifcollapse.ll =================================================================== --- test/CodeGen/AMDGPU/parallelorifcollapse.ll +++ test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -12,7 +12,7 @@ ; CHECK: OR_INT ; CHECK-NEXT: OR_INT ; CHECK-NEXT: OR_INT -define void @_Z9chk1D_512v() #0 { +define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: %a0 = alloca i32, align 4 %b0 = alloca i32, align 4 Index: test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll =================================================================== --- test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll +++ test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 -define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %in.gep Index: test/CodeGen/AMDGPU/predicates.ll =================================================================== --- test/CodeGen/AMDGPU/predicates.ll +++ test/CodeGen/AMDGPU/predicates.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: {{^}}simple_if: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @simple_if(i32 addrspace(1)* %out, i32 %in) { entry: %cmp0 = icmp sgt i32 %in, 0 br i1 %cmp0, label %IF, label %ENDIF @@ -25,7 +25,7 @@ ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF, label %ELSE @@ -51,7 +51,7 @@ ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @nested_if(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF0, label %ENDIF @@ -79,7 +79,7 @@ ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF0, label %ENDIF Index: test/CodeGen/AMDGPU/private-access-no-objects.ll =================================================================== --- test/CodeGen/AMDGPU/private-access-no-objects.ll +++ test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -18,7 +18,7 @@ ; OPTNONE-NOT: s_mov_b32 ; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} -define void @store_to_undef() #0 { +define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32* undef ret void } @@ -28,7 +28,7 @@ ; OPT-DAG: s_mov_b64 
s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @store_to_inttoptr() #0 { +define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32* inttoptr (i32 123 to i32*) ret void } @@ -38,7 +38,7 @@ ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @load_from_undef() #0 { +define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32* undef ret void } @@ -48,7 +48,7 @@ ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @load_from_inttoptr() #0 { +define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32* inttoptr (i32 123 to i32*) ret void } Index: test/CodeGen/AMDGPU/private-element-size.ll =================================================================== --- test/CodeGen/AMDGPU/private-element-size.ll +++ test/CodeGen/AMDGPU/private-element-size.ll @@ -36,7 +36,7 @@ ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -106,7 +106,7 @@ ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} -define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -143,7 +143,7 @@ ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -179,7 +179,7 @@ ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -228,7 +228,7 @@ ; 
HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/private-memory-atomics.ll =================================================================== --- test/CodeGen/AMDGPU/private-memory-atomics.ll +++ test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -4,7 +4,7 @@ ; This works because promote allocas pass replaces these with LDS atomics. ; Private atomics have no real use, but at least shouldn't crash on it. -define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -17,7 +17,7 @@ ret void } -define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 Index: test/CodeGen/AMDGPU/private-memory-broken.ll =================================================================== --- test/CodeGen/AMDGPU/private-memory-broken.ll +++ test/CodeGen/AMDGPU/private-memory-broken.ll @@ -7,7 +7,7 @@ declare i32 @foo(i32*) nounwind -define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 Index: test/CodeGen/AMDGPU/private-memory-r600.ll =================================================================== --- test/CodeGen/AMDGPU/private-memory-r600.ll +++ test/CodeGen/AMDGPU/private-memory-r600.ll @@ -16,7 +16,7 @@ ; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0 ; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -47,7 +47,7 @@ ; R600-NOT: MOVA_INT %struct.point = type { i32, i32 } -define void @multiple_structs(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: %a = alloca %struct.point %b = alloca %struct.point @@ -75,7 +75,7 @@ ; FUNC-LABEL: {{^}}direct_loop: ; R600-NOT: MOVA_INT -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %prv_array_const = alloca [2 x i32] %prv_array = alloca [2 x i32] @@ -110,7 +110,7 @@ ; FUNC-LABEL: {{^}}short_array: ; R600: MOVA_INT -define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i16] %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, 
i32 0, i32 0 @@ -127,7 +127,7 @@ ; FUNC-LABEL: {{^}}char_array: ; R600: MOVA_INT -define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8] %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0 @@ -148,7 +148,7 @@ ; R600-NOT: MOV T0.X ; Additional check in case the move ends up in the last slot ; R600-NOT: MOV * TO.X -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [2 x i32] %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0 @@ -169,7 +169,7 @@ ; R600_CHECK: MOV ; R600_CHECK: [[CHAN:[XYZW]]]+ ; R600-NOT: [[CHAN]]+ -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1 %1 = alloca [2 x i8], align 1 @@ -193,7 +193,7 @@ ret void } -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]] %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 @@ -207,7 +207,7 @@ ret void } -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -220,7 +220,7 @@ ret void } -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]] %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 @@ -235,7 +235,7 @@ %struct.pair32 = type { i32, i32 } -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]] %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 @@ -248,7 +248,7 @@ ret void } -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32] %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 @@ -261,7 +261,7 @@ ret void } -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -282,7 +282,7 @@ ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32] %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a store i32 5, i32* %tmp0 Index: test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll 
=================================================================== --- test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll +++ test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @array_alloca( ; CHECK: %stack = alloca i32, i32 5, align 4 -define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca i32, i32 5, align 4 %ld0 = load i32, i32 addrspace(1)* %in, align 4 @@ -27,7 +27,7 @@ ; CHECK-LABEL: @array_alloca_dynamic( ; CHECK: %stack = alloca i32, i32 %size, align 4 -define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { +define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { entry: %stack = alloca i32, i32 %size, align 4 %ld0 = load i32, i32 addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll +++ test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll @@ -7,14 +7,14 @@ declare void @foo.varargs(...) #0 ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo -define void @crash_call_constexpr_cast() #0 { +define amdgpu_kernel void @crash_call_constexpr_cast() #0 { %alloca = alloca i32 call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0 ret void } ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs -define void @crash_call_constexpr_cast_varargs() #0 { +define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 { %alloca = alloca i32 call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0 ret void Index: test/CodeGen/AMDGPU/promote-alloca-globals.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -5,12 +5,12 @@ @global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 -; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { ; IR: alloca [10 x i32] ; ASM-LABEL: {{^}}promote_alloca_size_256: ; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only) -define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [10 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll +++ test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}use_invariant_promotable_lds: ; GCN: buffer_load_dword ; GCN: ds_write_b32 -define void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void 
@use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* Index: test/CodeGen/AMDGPU/promote-alloca-lifetime.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-lifetime.ll +++ test/CodeGen/AMDGPU/promote-alloca-lifetime.ll @@ -7,7 +7,7 @@ ; OPT-NOT: alloca i32 ; OPT-NOT: llvm.lifetime ; OPT: store i32 %tmp3, i32 addrspace(3)* -define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* Index: test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -14,7 +14,7 @@ ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false) ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false) -define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -28,7 +28,7 @@ ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false) ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false) -define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -41,7 +41,7 @@ ; CHECK-LABEL: @promote_with_memset( ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false) -define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -53,7 +53,7 @@ ; CHECK-LABEL: @promote_with_objectsize( ; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false) -define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %size = call 
i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false) Index: test/CodeGen/AMDGPU/promote-alloca-no-opts.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-no-opts.ll +++ test/CodeGen/AMDGPU/promote-alloca-no-opts.ll @@ -5,7 +5,7 @@ ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}} ; NOOPTS-NOT ds_write ; OPTS: ds_write -define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -21,7 +21,7 @@ ; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array: ; ALL: workgroup_group_segment_byte_size = 0{{$}} ; ALL-NOT ds_write -define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { +define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 Index: test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: ; GCN: workgroup_group_segment_byte_size = 2340 -define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 @@ -62,7 +62,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_1: ; GCN: workgroup_group_segment_byte_size = 2352 -define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 @@ -100,7 +100,7 @@ ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit: ; GCN: workgroup_group_segment_byte_size = 1060 -define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll +++ test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}stored_lds_pointer_value: ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { %tmp = alloca float store float 0.0, float *%tmp store float* %tmp, float* addrspace(1)* %ptr @@ -14,7 +14,7 @@ ; GCN-LABEL: 
{{^}}stored_lds_pointer_value_offset: ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float store float 0.0, float *%tmp0 @@ -29,7 +29,7 @@ ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 { bb: %tmp = alloca float, i32 16 store float 0.0, float* %tmp @@ -46,7 +46,7 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) { entry: %tmp0 = alloca [4 x i32] %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0 @@ -64,7 +64,7 @@ ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-NOT: ds_ -define void @stored_fi_to_self() #0 { +define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32* store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp %bitcast = bitcast i32** %tmp to i32* Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll +++ test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll @@ -8,7 +8,7 @@ ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1 -define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -22,7 +22,7 @@ ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null -define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %cmp = icmp eq i32* %ptr0, null @@ -35,7 +35,7 @@ ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0 -define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %cmp = icmp eq i32* null, %ptr0 @@ -49,7 +49,7 @@ ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a ; CHECK: %ptr1 = call i32* @get_unknown_pointer() ; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1 -define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = call i32* @get_unknown_pointer() Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -13,7 +13,7 @@ ; CHECK: endif: ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 -define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %else @@ -34,7 +34,7 @@ ; CHECK-LABEL: @branch_ptr_phi_alloca_null_0( ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ] -define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %endif @@ -51,7 +51,7 @@ ; CHECK-LABEL: @branch_ptr_phi_alloca_null_1( ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ] -define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %endif @@ -73,7 +73,7 @@ ; CHECK: br label %exit ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ] ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 -define void @one_phi_value(i32 %a) #0 { +define amdgpu_kernel void @one_phi_value(i32 %a) #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a @@ -97,7 +97,7 @@ ; CHECK: endif: ; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] ; CHECK: store i32 0, i32* %phi.ptr, align 4 -define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %else @@ -134,7 +134,7 @@ ; CHECK-LABEL: @ptr_induction_var_same_alloca( ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] -define void @ptr_induction_var_same_alloca() #0 { +define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 @@ -172,7 +172,7 @@ ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] ; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call -define void @ptr_induction_var_alloca_unknown() #0 { +define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 { entry: %alloca = 
alloca [64 x i32], align 4 %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll +++ test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand( ; CHECK: %alloca = alloca i32 ; CHECK: select i1 undef, i32* undef, i32* %alloca -define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { %alloca = alloca i32, align 4 %select = select i1 undef, i32* undef, i32* %alloca store i32 0, i32* %select, align 4 @@ -16,7 +16,7 @@ ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 -define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -33,7 +33,7 @@ ; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a ; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b ; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1 -define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { %alloca0 = alloca i32, i32 16, align 4 %alloca1 = alloca i32, i32 16, align 4 %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a @@ -50,7 +50,7 @@ ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 -define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1 %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3 @@ -67,7 +67,7 @@ ; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2 ; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4 -define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -78,7 +78,7 @@ ret void } -define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { entry: %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a @@ -102,7 +102,7 @@ ; CHECK-LABEL: @select_null_rhs( 
; CHECK-NOT: alloca ; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null -define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { +define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 @@ -117,7 +117,7 @@ ; CHECK-LABEL: @select_null_lhs( ; CHECK-NOT: alloca ; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}} -define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { +define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 Index: test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll +++ test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: @try_promote_unhandled_intrinsic( ; CHECK: alloca ; CHECK: call void @llvm.stackrestore(i8* %tmp1) -define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* Index: test/CodeGen/AMDGPU/promote-alloca-volatile.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @volatile_load( ; CHECK: alloca [5 x i32] ; CHECK: load volatile i32, i32* -define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -16,7 +16,7 @@ ; CHECK-LABEL: @volatile_store( ; CHECK: alloca [5 x i32] ; CHECK: store volatile i32 %tmp, i32* -define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -30,7 +30,7 @@ ; CHECK: alloca double ; CHECK: load double ; CHECK: load volatile double -define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { +define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 Index: test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll =================================================================== --- test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll +++ test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll @@ -2,7 +2,7 @@ ; Don't crash ; CHECK: MAX_UINT -define void @test(i64 addrspace(1)* %out) { +define amdgpu_kernel void @test(i64 addrspace(1)* %out) { bb: store i64 2, i64 addrspace(1)* %out %tmp = load i64, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/r600.alu-limits.ll =================================================================== --- test/CodeGen/AMDGPU/r600.alu-limits.ll +++ test/CodeGen/AMDGPU/r600.alu-limits.ll @@ -6,7 +6,7 @@ %struct.foo = type {i32, i32, i32} -define void 
@alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) { +define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) { entry: %ptr = getelementptr inbounds %struct.foo, %struct.foo* %in, i32 1, i32 2 %x = load i32, i32 *%ptr, align 4 Index: test/CodeGen/AMDGPU/r600.bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/r600.bitcast.ll +++ test/CodeGen/AMDGPU/r600.bitcast.ll @@ -8,7 +8,7 @@ ; EG: VTX_READ_128 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 @@ -21,7 +21,7 @@ ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %load to <2 x i16> store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 @@ -33,7 +33,7 @@ ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %bc = bitcast <2 x i16> %load to float store float %bc, float addrspace(1)* %out, align 4 @@ -45,7 +45,7 @@ ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %bc = bitcast <4 x i8> %load to i32 store i32 %bc, i32 addrspace(1)* %out, align 4 @@ -57,7 +57,7 @@ ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -69,7 +69,7 @@ ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %bc = bitcast <2 x i16> %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -85,7 
+85,7 @@ ; EG: VTX_READ_16 ; EG-DAG: BFE_UINT ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 2 %bc = bitcast <4 x i16> %load to <8 x i8> %element = extractelement <8 x i8> %bc, i32 5 @@ -98,7 +98,7 @@ ; EG: VTX_READ_64 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %bc = bitcast <2 x i32> %val to double store double %bc, double addrspace(1)* %out, align 8 Index: test/CodeGen/AMDGPU/r600.global_atomics.ll =================================================================== --- test/CodeGen/AMDGPU/r600.global_atomics.ll +++ test/CodeGen/AMDGPU/r600.global_atomics.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_offset: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -16,7 +16,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_soffset: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -27,7 +27,7 @@ ; FIXME: looks like the offset is wrong ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -38,7 +38,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -49,7 +49,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -58,7 +58,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_addr64: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 
%in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -68,7 +68,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_offset: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -78,7 +78,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -89,7 +89,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -98,7 +98,7 @@ ; FUNC-LABEL: {{^}}atomic_and_i32_addr64: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -108,7 +108,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_offset: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -118,7 +118,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -129,7 +129,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -138,7 +138,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 
addrspace(1)* %ptr, i32 %in seq_cst @@ -148,7 +148,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_offset: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -158,7 +158,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -169,7 +169,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -178,7 +178,7 @@ ; FUNC-LABEL: {{^}}atomic_max_i32_addr64: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -188,7 +188,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_offset: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -198,7 +198,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -209,7 +209,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -218,7 +218,7 @@ ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -228,7 +228,7 @@ ; FUNC-LABEL: 
{{^}}atomic_min_i32_offset: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -238,7 +238,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -249,7 +249,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -258,7 +258,7 @@ ; FUNC-LABEL: {{^}}atomic_min_i32_addr64: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -268,7 +268,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_offset: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -278,7 +278,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -289,7 +289,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -298,7 +298,7 @@ ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -308,7 +308,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_offset: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; 
EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -318,7 +318,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -329,7 +329,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -338,7 +338,7 @@ ; FUNC-LABEL: {{^}}atomic_or_i32_addr64: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -348,7 +348,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -358,7 +358,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -369,7 +369,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -378,7 +378,7 @@ ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -388,7 +388,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32_offset(i32 
addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -398,7 +398,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -409,7 +409,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -418,7 +418,7 @@ ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -428,7 +428,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_offset: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -438,7 +438,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -449,7 +449,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -458,7 +458,7 @@ ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -468,7 +468,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: 
MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4 @@ -478,7 +478,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 ret void @@ -487,7 +487,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -498,7 +498,7 @@ ; FUNC-LABEL: {{^}}atomic_store_i32_addr64: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4 @@ -507,7 +507,7 @@ ; FUNC-LABEL: {{^}}atomic_inc_add ; EG: MEM_RAT ATOMIC_INC_UINT -define void @atomic_inc_add(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_inc_add(i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 1 seq_cst @@ -516,7 +516,7 @@ ; FUNC-LABEL: {{^}}atomic_dec_add ; EG: MEM_RAT ATOMIC_DEC_UINT -define void @atomic_dec_add(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_dec_add(i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 -1 seq_cst @@ -525,7 +525,7 @@ ; FUNC-LABEL: {{^}}atomic_inc_sub ; EG: MEM_RAT ATOMIC_INC_UINT -define void @atomic_inc_sub(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_inc_sub(i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 -1 seq_cst @@ -534,7 +534,7 @@ ; FUNC-LABEL: {{^}}atomic_dec_sub ; EG: MEM_RAT ATOMIC_DEC_UINT -define void @atomic_dec_sub(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_dec_sub(i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst Index: test/CodeGen/AMDGPU/r600.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/r600.private-memory.ll +++ test/CodeGen/AMDGPU/r600.private-memory.ll @@ -10,7 +10,7 @@ ; Additional check in case the move ends up in the last slot ; R600-NOT: MOV * TO.X -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: %0 = alloca [2 x i32] %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 Index: 
test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll =================================================================== --- test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll +++ test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll @@ -2,7 +2,7 @@ ; FUNC-LABEL: {{^}}tgid_x: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.X -define void @tgid_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -11,7 +11,7 @@ ; FUNC-LABEL: {{^}}tgid_y: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y -define void @tgid_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ ; FUNC-LABEL: {{^}}tgid_z: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z -define void @tgid_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ ; FUNC-LABEL: {{^}}tidig_x: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X -define void @tidig_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -38,7 +38,7 @@ ; FUNC-LABEL: {{^}}tidig_y: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y -define void @tidig_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -47,7 +47,7 @@ ; FUNC-LABEL: {{^}}tidig_z: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z -define void @tidig_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -57,7 +57,7 @@ ; FUNC-LABEL: {{^}}test_implicit: ; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56 -define void @test_implicit(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4 @@ -69,7 +69,7 @@ ; FUNC-LABEL: {{^}}test_implicit_dyn: ; 36 prepended implicit bytes + 8(out pointer + in) = 44 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44 -define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 { +define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in Index: test/CodeGen/AMDGPU/rcp-pattern.ll =================================================================== --- test/CodeGen/AMDGPU/rcp-pattern.ll +++ test/CodeGen/AMDGPU/rcp-pattern.ll @@ -9,7 +9,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void 
@rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -33,7 +33,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv fast float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -45,7 +45,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv arcp float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -57,7 +57,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { +define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -69,7 +69,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %rcp = fdiv float 1.0, %src.fabs store float %rcp, float addrspace(1)* %out, align 4 @@ -82,7 +82,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float -1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -92,7 +92,7 @@ ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]| ; GCN: buffer_store_dword [[RCP]] -define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg @@ -106,7 +106,7 @@ ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]| ; GCN: buffer_store_dword [[RCP]] ; GCN: buffer_store_dword [[MUL]] -define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg @@ -120,7 +120,7 @@ ; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32: ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} ; GCN: buffer_store_dword [[MUL]] -define void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef %rcp = fdiv arcp float %x, 2.0 store float %rcp, float addrspace(1)* %out, align 4 @@ -130,7 +130,7 @@ ; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32: ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}} ; GCN: buffer_store_dword [[MUL]] -define void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 { 
+define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef %rcp = fdiv arcp float %x, 10.0 store float %rcp, float addrspace(1)* %out, align 4 @@ -140,7 +140,7 @@ ; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32: ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}} ; GCN: buffer_store_dword [[MUL]] -define void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef %rcp = fdiv arcp float %x, -10.0 store float %rcp, float addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll =================================================================== --- test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll +++ test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll @@ -4,7 +4,7 @@ declare i32 @llvm.read_register.i32(metadata) #0 -define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll =================================================================== --- test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll +++ test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll @@ -4,7 +4,7 @@ declare i32 @llvm.read_register.i32(metadata) #0 -define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll =================================================================== --- test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll +++ test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll @@ -4,7 +4,7 @@ declare i64 @llvm.read_register.i64(metadata) #0 -define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 { %exec = call i64 @llvm.read_register.i64(metadata !0) store i64 %exec, i64 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/read_register.ll =================================================================== --- test/CodeGen/AMDGPU/read_register.ll +++ test/CodeGen/AMDGPU/read_register.ll @@ -9,7 +9,7 @@ ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]] ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_m0(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_read_exec(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 { %exec = call i64 @llvm.read_register.i64(metadata !1) store i64 %exec, i64 addrspace(1)* %out ret void @@ -30,7 +30,7 @@ ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo ; CHECK: v_mov_b32_e32 
v[[HI:[0-9]+]], flat_scratch_hi ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { %flat_scratch = call i64 @llvm.read_register.i64(metadata !2) store i64 %flat_scratch, i64 addrspace(1)* %out ret void @@ -39,7 +39,7 @@ ; CHECK-LABEL: {{^}}test_read_flat_scratch_lo: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3) store i32 %flat_scratch_lo, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ ; CHECK-LABEL: {{^}}test_read_flat_scratch_hi: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4) store i32 %flat_scratch_hi, i32 addrspace(1)* %out ret void @@ -57,7 +57,7 @@ ; CHECK-LABEL: {{^}}test_read_exec_lo: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { %exec_lo = call i32 @llvm.read_register.i32(metadata !5) store i32 %exec_lo, i32 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ ; CHECK-LABEL: {{^}}test_read_exec_hi: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec_hi(i32 addrspace(1)* %out) #0 { %exec_hi = call i32 @llvm.read_register.i32(metadata !6) store i32 %exec_hi, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/readcyclecounter.ll =================================================================== --- test/CodeGen/AMDGPU/readcyclecounter.ll +++ test/CodeGen/AMDGPU/readcyclecounter.ll @@ -13,7 +13,7 @@ ; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}} ; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: store_dwordx2 -define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.readcyclecounter() store volatile i64 %cycle0, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/reduce-load-width-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/reduce-load-width-alignment.ll +++ test/CodeGen/AMDGPU/reduce-load-width-alignment.ll @@ -6,7 +6,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]] ; GCN: buffer_store_dwordx2 -define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -16,7 +16,7 @@ ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0: ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: buffer_store_dword [[VAL]] -define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel 
void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %vec = bitcast i64 %a to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 0 @@ -27,7 +27,7 @@ ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; GCN: buffer_store_dword [[VAL]] -define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %vec = bitcast i64 %a to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 1 Index: test/CodeGen/AMDGPU/reduce-store-width-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/reduce-store-width-alignment.ll +++ test/CodeGen/AMDGPU/reduce-store-width-alignment.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4: ; GCN: s_load_dwordx2 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { +define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { %x.bc = bitcast <2 x i32> %x to <4 x i16> store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN: s_load_dwordx4 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { %x.bc = bitcast <4 x i32> %x to <8 x i16> store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4 ret void @@ -22,7 +22,7 @@ ; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4: ; GCN: s_load_dwordx2 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { +define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { %x.bc = bitcast <2 x i32> %x to <4 x i16> store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4 ret void @@ -32,7 +32,7 @@ ; GCN: s_load_dwordx4 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { %x.bc = bitcast <4 x i32> %x to <2 x i64> store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4 ret void @@ -44,7 +44,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { +define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { %x.bc = bitcast <4 x i16> %x to <2 x i32> store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll 
=================================================================== --- test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll +++ test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 -define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 { +define amdgpu_kernel void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 { bb: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %cmp0 = icmp eq i32 %id.x, 0 Index: test/CodeGen/AMDGPU/regcoalesce-dbg.mir =================================================================== --- test/CodeGen/AMDGPU/regcoalesce-dbg.mir +++ test/CodeGen/AMDGPU/regcoalesce-dbg.mir @@ -8,7 +8,7 @@ # CHECK: DBG_VALUE{{.*}}debug-use %13.sub2 --- | - define void @test(i32 addrspace(1)* %out) { ret void } + define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void } !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) Index: test/CodeGen/AMDGPU/register-count-comments.ll =================================================================== --- test/CodeGen/AMDGPU/register-count-comments.ll +++ test/CodeGen/AMDGPU/register-count-comments.ll @@ -9,7 +9,7 @@ ; SI: ; Kernel info: ; SI: ; NumSgprs: {{[0-9]+}} ; SI: ; NumVgprs: {{[0-9]+}} -define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { +define amdgpu_kernel void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid @@ -24,7 +24,7 @@ ; SI-LABEL: {{^}}one_vgpr_used: ; SI: NumVgprs: 1 -define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { +define amdgpu_kernel void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { store i32 %x, i32 addrspace(1)* %out, align 4 ret void } Index: test/CodeGen/AMDGPU/rename-disconnected-bug.ll =================================================================== --- test/CodeGen/AMDGPU/rename-disconnected-bug.ll +++ test/CodeGen/AMDGPU/rename-disconnected-bug.ll @@ -3,7 +3,7 @@ ; definition on every path (there should at least be IMPLICIT_DEF instructions). target triple = "amdgcn--" -define void @func() { +define amdgpu_kernel void @func() { B0: br i1 undef, label %B1, label %B2 Index: test/CodeGen/AMDGPU/rename-independent-subregs.mir =================================================================== --- test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck %s --- | - define void @test0() { ret void } - define void @test1() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } ... 
--- # In the test below we have two independent def+use pairs of subregister1 which Index: test/CodeGen/AMDGPU/reorder-stores.ll =================================================================== --- test/CodeGen/AMDGPU/reorder-stores.ll +++ test/CodeGen/AMDGPU/reorder-stores.ll @@ -7,7 +7,7 @@ ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 @@ -19,7 +19,7 @@ ; SI: ds_read2_b64 ; SI: ds_write2_b64 ; SI: s_endpgm -define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 @@ -39,7 +39,7 @@ ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 @@ -54,7 +54,7 @@ ; SI-NOT: ds_read ; SI: ds_write_b64 ; SI: s_endpgm -define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> Index: test/CodeGen/AMDGPU/rotl.i64.ll =================================================================== --- test/CodeGen/AMDGPU/rotl.i64.ll +++ test/CodeGen/AMDGPU/rotl.i64.ll @@ -7,7 +7,7 @@ ; BOTH-DAG: s_lshr_b64 ; BOTH: s_or_b64 ; BOTH: s_endpgm -define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +define amdgpu_kernel void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { entry: %0 = shl i64 %x, %y %1 = sub i64 64, %y @@ -26,7 +26,7 @@ ; BOTH: v_or_b32 ; BOTH: v_or_b32 ; BOTH: s_endpgm -define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { entry: %x = load i64, i64 addrspace(1)* %xptr, align 8 %y = load i64, i64 addrspace(1)* %yptr, align 8 Index: test/CodeGen/AMDGPU/rotl.ll =================================================================== --- test/CodeGen/AMDGPU/rotl.ll +++ test/CodeGen/AMDGPU/rotl.ll @@ -10,7 +10,7 @@ ; SI: s_sub_i32 
[[SDST:s[0-9]+]], 32, {{[s][0-9]+}} ; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] ; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] -define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +define amdgpu_kernel void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { entry: %0 = shl i32 %x, %y %1 = sub i32 32, %y @@ -26,7 +26,7 @@ ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_alignbit_b32 ; SI: s_endpgm -define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { entry: %0 = shl <2 x i32> %x, %y %1 = sub <2 x i32> <i32 32, i32 32>, %y @@ -46,7 +46,7 @@ ; SI-DAG: s_sub_i32 ; SI-DAG: v_alignbit_b32 ; SI: s_endpgm -define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { entry: %0 = shl <4 x i32> %x, %y %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y Index: test/CodeGen/AMDGPU/rotr.i64.ll =================================================================== --- test/CodeGen/AMDGPU/rotr.i64.ll +++ test/CodeGen/AMDGPU/rotr.i64.ll @@ -6,7 +6,7 @@ ; BOTH-DAG: s_lshr_b64 ; BOTH-DAG: s_lshl_b64 ; BOTH: s_or_b64 -define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +define amdgpu_kernel void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { entry: %tmp0 = sub i64 64, %y %tmp1 = shl i64 %x, %tmp0 @@ -24,7 +24,7 @@ ; VI-DAG: v_lshlrev_b64 ; BOTH: v_or_b32 ; BOTH: v_or_b32 -define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { entry: %x = load i64, i64 addrspace(1)* %xptr, align 8 %y = load i64, i64 addrspace(1)* %yptr, align 8 @@ -37,7 +37,7 @@ } ; BOTH-LABEL: {{^}}s_rotr_v2i64: -define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { +define amdgpu_kernel void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { entry: %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y %tmp1 = shl <2 x i64> %x, %tmp0 @@ -48,7 +48,7 @@ } ; BOTH-LABEL: {{^}}v_rotr_v2i64: -define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { entry: %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 Index: test/CodeGen/AMDGPU/rotr.ll =================================================================== --- test/CodeGen/AMDGPU/rotr.ll +++ test/CodeGen/AMDGPU/rotr.ll @@ -6,7 +6,7 @@ ; R600: BIT_ALIGN_INT ; SI: v_alignbit_b32 -define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +define amdgpu_kernel void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { entry: %tmp0 = sub i32 32, %y %tmp1 = shl i32 %x, %tmp0 @@ -22,7 +22,7 @@ ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 -define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { entry: %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y %tmp1 = shl <2 x i32> %x, %tmp0 @@ -42,7 +42,7 @@ ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 -define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32>
%y) { entry: %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y %tmp1 = shl <4 x i32> %x, %tmp0 Index: test/CodeGen/AMDGPU/rsq.ll =================================================================== --- test/CodeGen/AMDGPU/rsq.ll +++ test/CodeGen/AMDGPU/rsq.ll @@ -8,7 +8,7 @@ ; SI-LABEL: {{^}}rsq_f32: ; SI: v_rsq_f32_e32 ; SI: s_endpgm -define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt @@ -20,7 +20,7 @@ ; SI-UNSAFE: v_rsq_f64_e32 ; SI-SAFE: v_sqrt_f64_e32 ; SI: s_endpgm -define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone %div = fdiv double 1.0, %sqrt @@ -31,7 +31,7 @@ ; SI-LABEL: {{^}}rsq_f32_sgpr: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} ; SI: s_endpgm -define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { +define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt store float %div, float addrspace(1)* %out, align 4 @@ -55,7 +55,7 @@ ; SI-SAFE-NOT: v_rsq_f32 ; SI: s_endpgm -define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -81,7 +81,7 @@ ; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) %div = fdiv float -1.0, %sqrt @@ -96,7 +96,7 @@ ; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) %div = fdiv double -1.0, %sqrt @@ -112,7 +112,7 @@ ; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4
%val.fneg = fsub float -0.0, %val %sqrt = call float @llvm.sqrt.f32(float %val.fneg) @@ -128,7 +128,7 @@ ; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %val.fneg = fsub double -0.0, %val %sqrt = call double @llvm.sqrt.f64(double %val.fneg) Index: test/CodeGen/AMDGPU/s_addk_i32.ll =================================================================== --- test/CodeGen/AMDGPU/s_addk_i32.ll +++ test/CodeGen/AMDGPU/s_addk_i32.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 65 store i32 %add, i32 addrspace(1)* %out ret void @@ -19,7 +19,7 @@ ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] ; SI: s_endpgm -define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 %add1 = add i32 %b, 65 store i32 %add0, i32 addrspace(1)* %out0 @@ -30,7 +30,7 @@ ; SI-LABEL: {{^}}s_addk_i32_k1: ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 32767 ; (1 << 15) - 1 store i32 %add, i32 addrspace(1)* %out ret void @@ -39,7 +39,7 @@ ; SI-LABEL: {{^}}s_addk_i32_k2: ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17 ; SI: s_endpgm -define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, -17 store i32 %add, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ ; SI-LABEL: {{^}}s_addk_i32_k3: ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}} ; SI: s_endpgm -define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, -65 store i32 %add, i32 addrspace(1)* %out ret void @@ -58,7 +58,7 @@ ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm -define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { +define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { %add = add <2 x i32> %b, <i32 65, i32 66> store <2 x i32> %add, <2 x i32> addrspace(1)* %out ret void @@ -70,7 +70,7 @@ ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 ; SI: s_endpgm -define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { +define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68> store <4 x i32> %add, <4 x i32> addrspace(1)* %out ret void @@ -86,7 +86,7 @@ ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48 ; SI: s_endpgm -define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { +define amdgpu_kernel void @s_addk_v8i32_k0(<8 x i32>
addrspace(1)* %out, <8 x i32> %b) { %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72> store <8 x i32> %add, <8 x i32> addrspace(1)* %out ret void @@ -95,7 +95,7 @@ ; SI-LABEL: {{^}}no_s_addk_i32_k0: ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}} ; SI: s_endpgm -define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 32768 ; 1 << 15 store i32 %add, i32 addrspace(1)* %out ret void @@ -105,7 +105,7 @@ ; SI-LABEL: {{^}}commute_s_addk_i32: ; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 { +define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %add = add i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add) Index: test/CodeGen/AMDGPU/s_movk_i32.ll =================================================================== --- test/CodeGen/AMDGPU/s_movk_i32.ll +++ test/CodeGen/AMDGPU/s_movk_i32.ll @@ -7,7 +7,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -21,7 +21,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -35,7 +35,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) store i64 %or, i64 addrspace(1)* %out @@ -49,7 +49,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -63,7 +63,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 <<
32) store i64 %or, i64 addrspace(1)* %out @@ -78,7 +78,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff store i64 %or, i64 addrspace(1)* %out @@ -92,7 +92,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) store i64 %or, i64 addrspace(1)* %out @@ -107,7 +107,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) store i64 %or, i64 addrspace(1)* %out @@ -122,7 +122,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 store i64 %or, i64 addrspace(1)* %out @@ -137,7 +137,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 store i64 %or, i64 addrspace(1)* %out @@ -152,7 +152,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 store i64 %or, i64 addrspace(1)* %out @@ -167,7 +167,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 
addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff store i64 %or, i64 addrspace(1)* %out @@ -182,7 +182,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 store i64 %or, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/s_mulk_i32.ll =================================================================== --- test/CodeGen/AMDGPU/s_mulk_i32.ll +++ test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 65 store i32 %mul, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ ; SI-LABEL: {{^}}s_mulk_i32_k1: ; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 32767 ; (1 << 15) - 1 store i32 %mul, i32 addrspace(1)* %out ret void @@ -25,7 +25,7 @@ ; SI-LABEL: {{^}}s_mulk_i32_k2: ; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}} ; SI: s_endpgm -define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, -17 store i32 %mul, i32 addrspace(1)* %out ret void @@ -34,7 +34,7 @@ ; SI-LABEL: {{^}}no_s_mulk_i32_k0: ; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}} ; SI: s_endpgm -define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 32769 ; 1 << 15 + 1 store i32 %mul, i32 addrspace(1)* %out ret void @@ -44,7 +44,7 @@ ; SI-LABEL: {{^}}commute_s_mulk_i32: ; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 { +define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %add = mul i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add) Index: test/CodeGen/AMDGPU/sad.ll =================================================================== --- test/CodeGen/AMDGPU/sad.ll +++ test/CodeGen/AMDGPU/sad.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -18,7 +18,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_constant_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20 -define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { %icmp0 = icmp ugt i32 %a, 90 %t0 = select i1 %icmp0, i32 %a, i32 90 @@ -34,7 +34,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 
v{{[0-9]+}} -define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -51,7 +51,7 @@ ; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -68,7 +68,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -84,7 +84,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b store volatile i32 %t0, i32 *undef @@ -101,7 +101,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -119,7 +119,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b store volatile i32 %sub0, i32 *undef @@ -136,7 +136,7 @@ ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -154,7 +154,7 @@ ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { %icmp0 = icmp ugt <4 x i32> %a, %b %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b @@ -173,7 +173,7 @@ ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; 
GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { %icmp0 = icmp ugt <4 x i32> %a, %b %sub0 = sub <4 x i32> %a, %b %sub1 = sub <4 x i32> %b, %a @@ -187,7 +187,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_i16_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { %icmp0 = icmp ugt i16 %a, %b %t0 = select i1 %icmp0, i16 %a, i16 %b @@ -204,7 +204,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { +define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { %icmp0 = icmp ugt i16 %a, %b %sub0 = sub i16 %a, %b %sub1 = sub i16 %b, %a @@ -218,7 +218,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_i8_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { %icmp0 = icmp ugt i8 %a, %b %t0 = select i1 %icmp0, i8 %a, i8 %b @@ -234,7 +234,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { +define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp ugt i8 %a, %b %sub0 = sub i8 %a, %b %sub1 = sub i8 %b, %a @@ -251,7 +251,7 @@ ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -269,7 +269,7 @@ ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %d %sub1 = sub i32 %b, %a Index: test/CodeGen/AMDGPU/saddo.ll =================================================================== --- test/CodeGen/AMDGPU/saddo.ll +++ test/CodeGen/AMDGPU/saddo.ll @@ -6,7 +6,7 @@ declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone ; FUNC-LABEL: {{^}}saddo_i64_zext: -define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } 
%sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -17,7 +17,7 @@ } ; FUNC-LABEL: {{^}}s_saddo_i32: -define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %sadd, 0 %carry = extractvalue { i32, i1 } %sadd, 1 @@ -27,7 +27,7 @@ } ; FUNC-LABEL: {{^}}v_saddo_i32: -define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -39,7 +39,7 @@ } ; FUNC-LABEL: {{^}}s_saddo_i64: -define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -51,7 +51,7 @@ ; FUNC-LABEL: {{^}}v_saddo_i64: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -24,7 +24,7 @@ ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = call i32 @llvm.amdgcn.workitem.id.y() @@ -65,7 +65,7 @@ ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] ; GCN-NOHSA: buffer_store_dword [[V_OUT]] ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]] -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { entry: %tmp = icmp ne i32 %a, 0 br i1 %tmp, label %if, label %else @@ -93,7 +93,7 @@ ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -113,7 +113,7 @@ ; GCN-NOHSA: buffer_store_dword ; 
GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} -define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp @@ -133,7 +133,7 @@ ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp @@ -155,7 +155,7 @@ ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp @@ -189,7 +189,7 @@ ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp @@ -230,7 +230,7 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN: s_endpgm -define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp @@ -247,7 +247,7 @@ ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]] ; GCN-NOHSA: buffer_store_dword [[ADD]] ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]] -define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { +define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -261,7 +261,7 @@ ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} ; GCN-HSA flat_load_dword v{{[0-9]}}, v{{[0-9]+:[0-9]+}} -define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -275,7 +275,7 @@ ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 
addr64 offset:1024{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -290,7 +290,7 @@ ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -313,7 +313,7 @@ ; GCN-NOHSA: buffer_store_dword ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -350,7 +350,7 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -385,7 +385,7 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -441,7 +441,7 @@ ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm -define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { bb3: ; preds = %bb2 %tmp0 = bitcast i32 %cond to float %tmp1 = fadd float %tmp0, 2.500000e-01 @@ -459,7 +459,7 @@ ; GCN-LABEL: {{^}}phi_visit_order: ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}} -define void @phi_visit_order() { +define amdgpu_kernel void @phi_visit_order() { bb: br label %bb1 @@ -484,7 +484,7 @@ ; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]: ; GCN: s_xor_b32 [[B]], [[B]], [[A]] ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]] -define void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { +define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { entry: br label %loop Index: test/CodeGen/AMDGPU/sampler-resource-id.ll =================================================================== --- test/CodeGen/AMDGPU/sampler-resource-id.ll +++ test/CodeGen/AMDGPU/sampler-resource-id.ll @@ -5,7 +5,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_0(i32 %in0, i32 addrspace(1)* %out) { +define 
amdgpu_kernel void @test_0(i32 %in0, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0 store i32 %0, i32 addrspace(1)* %out @@ -17,7 +17,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0 store i32 %0, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/scalar-store-cache-flush.mir =================================================================== --- test/CodeGen/AMDGPU/scalar-store-cache-flush.mir +++ test/CodeGen/AMDGPU/scalar-store-cache-flush.mir @@ -1,23 +1,23 @@ # RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s --- | - define void @basic_insert_dcache_wb() { + define amdgpu_kernel void @basic_insert_dcache_wb() { ret void } - define void @explicit_flush_after() { + define amdgpu_kernel void @explicit_flush_after() { ret void } - define void @explicit_flush_before() { + define amdgpu_kernel void @explicit_flush_before() { ret void } - define void @no_scalar_store() { + define amdgpu_kernel void @no_scalar_store() { ret void } - define void @multi_block_store() { + define amdgpu_kernel void @multi_block_store() { bb0: br i1 undef, label %bb1, label %bb2 @@ -28,7 +28,7 @@ ret void } - define void @one_block_store() { + define amdgpu_kernel void @one_block_store() { bb0: br i1 undef, label %bb1, label %bb2 Index: test/CodeGen/AMDGPU/scalar_to_vector.ll =================================================================== --- test/CodeGen/AMDGPU/scalar_to_vector.ll +++ test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -9,7 +9,7 @@ ; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]] ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] ; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} -define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tmp1 = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -21,7 +21,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] ; GCN: buffer_store_dwordx2 -define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tmp1 = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -33,7 +33,7 @@ ; to produce one, but for some reason never made it to selection. 
-; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 ; %bc = bitcast i32 %tmp1 to <4 x i8> @@ -42,7 +42,7 @@ ; ret void ; } -; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 ; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 ; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> @@ -51,7 +51,7 @@ ; ret void ; } -; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 ; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> ; %add = add <8 x i16> %bc, @@ -59,7 +59,7 @@ ; ret void ; } -; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 ; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> ; %add = add <4 x i16> %bc, @@ -67,7 +67,7 @@ ; ret void ; } -define void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { +define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 %bc = bitcast <4 x i8> %newvec0 to <2 x half> store <2 x half> %bc, <2 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll +++ test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s ; REQUIRES: asserts -define void @main() #0 { +define amdgpu_kernel void @main() #0 { main_body: %tmp = load <4 x float>, <4 x float> addrspace(9)* null %tmp5 = extractelement <4 x float> %tmp, i32 3 Index: test/CodeGen/AMDGPU/schedule-global-loads.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-global-loads.ll +++ test/CodeGen/AMDGPU/schedule-global-loads.ll @@ -10,7 +10,7 @@ ; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; SI: buffer_store_dword [[REG0]] ; SI: buffer_store_dword [[REG1]] -define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { %load0 = load i32, i32 addrspace(1)* %ptr, align 4 %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2 %load1 = load i32, i32 addrspace(1)* %gep, align 4 @@ -24,7 +24,7 @@ ; FUNC-LABEL: {{^}}same_base_ptr_crash: ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +define amdgpu_kernel void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { entry: %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset %tmp0 = load i32, i32 addrspace(1)* %out Index: 
test/CodeGen/AMDGPU/schedule-if-2.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-if-2.ll +++ test/CodeGen/AMDGPU/schedule-if-2.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs ;REQUIRES: asserts -define void @main() { +define amdgpu_kernel void @main() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) %1 = extractelement <4 x float> %0, i32 0 Index: test/CodeGen/AMDGPU/schedule-if.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-if.ll +++ test/CodeGen/AMDGPU/schedule-if.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs ;REQUIRES: asserts -define void @main() { +define amdgpu_kernel void @main() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) %1 = extractelement <4 x float> %0, i32 0 Index: test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -12,7 +12,7 @@ ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 -define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { +define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 ret void @@ -26,7 +26,7 @@ ; GCN: s_load_dwordx2 ; GCN: s_load_dwordx2 ; GCN: s_endpgm -define void @same_base_ptr_crash(i64 addrspace(1)* %out, +define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, Index: test/CodeGen/AMDGPU/schedule-regpressure-limit.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-regpressure-limit.ll +++ test/CodeGen/AMDGPU/schedule-regpressure-limit.ll @@ -5,7 +5,7 @@ ; We expect a two digit VGPR usage here, not a three digit. 
; CHECK: NumVgprs: {{[0-9][0-9]$}} -define void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) { +define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1 %tmp2 = load float, float addrspace(3)* %tmp, align 4 Index: test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -10,7 +10,7 @@ ; VI: NumSgprs: {{[1-5][0-9]$}} ; VI: NumVgprs: {{[1-3][0-9]$}} -define void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { +define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { bb: %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004 %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252 Index: test/CodeGen/AMDGPU/scratch-buffer.ll =================================================================== --- test/CodeGen/AMDGPU/scratch-buffer.ll +++ test/CodeGen/AMDGPU/scratch-buffer.ll @@ -13,7 +13,7 @@ ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { +define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { entry: %scratch0 = alloca [8192 x i32] %scratch1 = alloca [8192 x i32] @@ -53,7 +53,7 @@ ; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { +define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { entry: %scratch0 = alloca [8192 x i32] %scratch1 = alloca [8192 x i32] @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}neg_vaddr_offset: ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}} -define void @neg_vaddr_offset(i32 %offset) { +define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: %array = alloca [8192 x i32] %ptr_offset = add i32 %offset, 4 @@ -99,7 +99,7 @@ ; GCN-LABEL: {{^}}pos_vaddr_offset: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20 -define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32] %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4 Index: test/CodeGen/AMDGPU/sdiv.ll =================================================================== --- test/CodeGen/AMDGPU/sdiv.ll +++ test/CodeGen/AMDGPU/sdiv.ll @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}sdiv_i32: ; EG: CF_END -define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in %den = load 
i32, i32 addrspace(1) * %den_ptr @@ -23,7 +23,7 @@ } ; FUNC-LABEL: {{^}}sdiv_i32_4: -define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = sdiv i32 %num, 4 store i32 %result, i32 addrspace(1)* %out @@ -43,14 +43,14 @@ ; SI: v_add_i32 ; SI: buffer_store_dword ; SI: s_endpgm -define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = sdiv i32 %num, 3435 store i32 %result, i32 addrspace(1)* %out ret void } -define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr @@ -59,14 +59,14 @@ ret void } -define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %result = sdiv <2 x i32> %num, store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void } -define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr @@ -75,7 +75,7 @@ ret void } -define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = sdiv <4 x i32> %num, store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -86,7 +86,7 @@ ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -100,7 +100,7 @@ ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1 %num = load i23, i23 addrspace(1) * %in %den = load i23, i23 addrspace(1) * %den_ptr @@ -114,7 +114,7 @@ ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 %num = load i24, i24 addrspace(1) * %in %den = load i24, i24 addrspace(1) * %den_ptr @@ -126,7 +126,7 @@ ; FUNC-LABEL: {{^}}v_sdiv_i25: ; SI-NOT: v_rcp_f32 -define void 
@v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1 %num = load i25, i25 addrspace(1) * %in %den = load i25, i25 addrspace(1) * %den_ptr @@ -137,19 +137,19 @@ } ; Tests for 64-bit divide bypass. -; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %result = sdiv i64 %a, %b ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void ; } -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %result = srem i64 %a, %b ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void ; } -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %resultdiv = sdiv i64 %a, %b ; %resultrem = srem i64 %a, %b ; %result = add i64 %resultdiv, %resultrem @@ -163,7 +163,7 @@ ; SI: v_mul_hi_i32 ; SI: v_mul_hi_i32 -define void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { +define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %2 = sdiv <4 x i32> %1, store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 Index: test/CodeGen/AMDGPU/sdivrem24.ll =================================================================== --- test/CodeGen/AMDGPU/sdivrem24.ll +++ test/CodeGen/AMDGPU/sdivrem24.ll @@ -12,7 +12,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -31,7 +31,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -50,7 +50,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -69,7 +69,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -88,7 +88,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, 
i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -107,7 +107,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -130,7 +130,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -149,7 +149,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -168,7 +168,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -187,7 +187,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -206,7 +206,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -225,7 +225,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -244,7 +244,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -263,7 +263,7 @@ ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define 
void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -283,7 +283,7 @@ ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -303,7 +303,7 @@ ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -323,7 +323,7 @@ ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 Index: test/CodeGen/AMDGPU/sdivrem64.ll =================================================================== --- test/CodeGen/AMDGPU/sdivrem64.ll +++ test/CodeGen/AMDGPU/sdivrem64.ll @@ -70,7 +70,7 @@ ; SI-NOT: v_lshr_b64 ; VI-NOT: v_lshrrev_b64 ; GCN: s_endpgm -define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = sdiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -144,7 +144,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -159,7 +159,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = sdiv i64 %1, %2 @@ -176,7 +176,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = srem i64 %1, %2 @@ -196,7 +196,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -216,7 +216,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 Index: 
test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -8,7 +8,7 @@ ; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -define void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %a = load i32, i32 addrspace(1)* %in, align 4 %shr = lshr i32 %a, 16 %add = add i32 %a, %shr @@ -23,7 +23,7 @@ ; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -define void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %a = load i32, i32 addrspace(1)* %in, align 4 %shr = lshr i32 %a, 16 %sub = sub i32 %shr, %a @@ -39,7 +39,7 @@ ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -define void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) { +define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) { %a = load i32, i32 addrspace(1)* %in1, align 4 %b = load i32, i32 addrspace(1)* %in2, align 4 %shra = lshr i32 %a, 16 @@ -55,7 +55,7 @@ ; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_u32_u24_sdwa -define void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) { +define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) { entry: %a = load i16, i16 addrspace(1)* %ina, align 4 %b = load i16, i16 addrspace(1)* %inb, align 4 @@ -75,7 +75,7 @@ ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}} -define void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 @@ -97,7 +97,7 @@ ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}} ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}} -define void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { entry: %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4 %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4 @@ -123,7 +123,7 @@ ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}} ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}} -define void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { entry: %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4 %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4 @@ -138,7 +138,7 @@ ; SDWA: 
v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_f16_sdwa -define void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) { +define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) { entry: %a = load half, half addrspace(1)* %ina, align 4 %b = load half, half addrspace(1)* %inb, align 4 @@ -157,7 +157,7 @@ ; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -define void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 @@ -178,7 +178,7 @@ ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { entry: %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4 %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4 @@ -204,7 +204,7 @@ ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { entry: %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4 %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4 @@ -219,7 +219,7 @@ ; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_u32_u24_sdwa -define void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) { +define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) { entry: %a = load i8, i8 addrspace(1)* %ina, align 4 %b = load i8, i8 addrspace(1)* %inb, align 4 @@ -238,7 +238,7 @@ ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -define void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { entry: %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4 %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4 @@ -259,7 +259,7 @@ ; SDWA: v_mul_u32_u24_sdwa ; SDWA: v_mul_u32_u24_sdwa -define void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { entry: %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4 %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4 @@ -283,7 +283,7 @@ ; SDWA: v_mul_u32_u24_sdwa ; SDWA: v_mul_u32_u24_sdwa -define void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* 
%ina, <8 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { entry: %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4 %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4 @@ -304,7 +304,7 @@ ; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] -define void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 @@ -318,7 +318,7 @@ ; NOSDWA-NOT: v_mul_u32_u24_sdwa ; SDWA-NOT: v_mul_u32_u24_sdwa -define void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %mul = mul <2 x i16> %a, @@ -337,7 +337,7 @@ ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -define void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 @@ -353,7 +353,7 @@ ; SDWA-NOT: v_mul_u32_u24_sdwa ; SDWA-NOT: v_add_i32_sdwa -define void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) { +define amdgpu_kernel void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 Index: test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll =================================================================== --- test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll +++ test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { +define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -29,7 +29,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { +define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0) Index: test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll 
=================================================================== --- test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -8,7 +8,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] -define void @add_select_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -30,7 +30,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]] -define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -57,7 +57,7 @@ ; GCN: buffer_store_dword [[ADD]] ; GCN: buffer_store_dword [[X_ABS]] -define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -80,7 +80,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]] -define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -104,7 +104,7 @@ ; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_var_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -123,7 +123,7 @@ ; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_fabs_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -140,7 +140,7 @@ ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]] -define void @add_select_fabs_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -155,7 +155,7 @@ ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_posk_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 { %x = load volatile float, float 
addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float 2.0, float 1.0 @@ -172,7 +172,7 @@ ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_negk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -192,7 +192,7 @@ ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_negliteralk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -209,7 +209,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] -define void @add_select_fabs_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -228,7 +228,7 @@ ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] -define void @add_select_posk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -246,7 +246,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -268,7 +268,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]] -define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -295,7 +295,7 @@ ; GCN: buffer_store_dword [[ADD]] ; GCN: buffer_store_dword [[NEG_X]] -define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -318,7 +318,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]] -define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float 
addrspace(1)* undef @@ -342,7 +342,7 @@ ; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fneg_var_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -360,7 +360,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -378,7 +378,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_inv2pi_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -398,7 +398,7 @@ ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -415,7 +415,7 @@ ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -432,7 +432,7 @@ ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2048.0, float -4096.0 @@ -446,7 +446,7 @@ ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] -define void @add_select_fneg_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -463,7 +463,7 @@ ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_negk_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -480,7 +480,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_posk_f32(i32 %c) #0 { +define amdgpu_kernel void 
@add_select_fneg_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -498,7 +498,7 @@ ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_posk_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -518,7 +518,7 @@ ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_negfabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -541,7 +541,7 @@ ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -564,7 +564,7 @@ ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_neg_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -586,7 +586,7 @@ ; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_neg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -607,7 +607,7 @@ ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_neg_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -629,7 +629,7 @@ ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_negfabs_neg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -651,7 +651,7 @@ ; GCN-DAG: v_and_b32_e32 
[[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] -define void @mul_select_negfabs_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -672,7 +672,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] -define void @mul_select_posk_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -690,7 +690,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] -define void @mul_select_negfabs_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -709,7 +709,7 @@ ; GCN: v_cmp_ne_u32_e64 vcc ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] -define void @mul_select_negk_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -732,7 +732,7 @@ ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_add_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -749,7 +749,7 @@ ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_sub_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %add = fsub float %x, 4.0 @@ -765,7 +765,7 @@ ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_mul_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %mul = fmul float %x, 4.0 @@ -782,7 +782,7 @@ ; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_fma_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -799,7 +799,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { %x = load volatile float, 
float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -818,7 +818,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 Index: test/CodeGen/AMDGPU/select-i1.ll =================================================================== --- test/CodeGen/AMDGPU/select-i1.ll +++ test/CodeGen/AMDGPU/select-i1.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}select_i1: ; SI: v_cndmask_b32 ; SI-NOT: v_cndmask_b32 -define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { +define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i1 %a, i1 %b store i1 %sel, i1 addrspace(1)* %out, align 4 @@ -19,7 +19,7 @@ ; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]] ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { +define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 %b store i1 %sel, i1 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/select-opt.ll =================================================================== --- test/CodeGen/AMDGPU/select-opt.ll +++ test/CodeGen/AMDGPU/select-opt.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -27,7 +27,7 @@ ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -43,7 +43,7 @@ ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -59,7 +59,7 @@ ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 
v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -76,7 +76,7 @@ ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] ; GCN: s_endpgm -define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -92,7 +92,7 @@ ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -108,7 +108,7 @@ ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -124,7 +124,7 @@ ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -138,7 +138,7 @@ ; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} -define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { +define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { entry: %cmp0 = fcmp oeq float %c0, 1.0 br i1 %cmp0, label %if0, label %endif Index: test/CodeGen/AMDGPU/select-vectors.ll =================================================================== --- test/CodeGen/AMDGPU/select-vectors.ll +++ test/CodeGen/AMDGPU/select-vectors.ll @@ -10,7 +10,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { +define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { %cmp = icmp eq i8 %c, 0 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 @@ -22,7 +22,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, 
i32 %c) nounwind { +define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 @@ -36,7 +36,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 -define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 @@ -49,7 +49,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx4 -define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 @@ -64,7 +64,7 @@ ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: buffer_store_dwordx4 -define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -82,7 +82,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 @@ -102,7 +102,7 @@ ; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 -define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x float> %a, <2 x float> %b store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 @@ -120,7 +120,7 @@ ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx4 -define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x float> %a, <4 x float> %b store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 @@ -135,7 +135,7 @@ ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: buffer_store_dwordx4 -define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 %val = load <4 x float>, <4 x float> addrspace(1)* %in @@ -153,7 
+153,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x float> %a, <8 x float> %b store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 @@ -165,7 +165,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 @@ -181,7 +181,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 @@ -205,7 +205,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 Index: test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- test/CodeGen/AMDGPU/select.f16.ll +++ test/CodeGen/AMDGPU/select.f16.ll @@ -17,7 +17,7 @@ ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16( +define amdgpu_kernel void @select_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -48,7 +48,7 @@ ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_a( +define amdgpu_kernel void @select_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c, @@ -78,7 +78,7 @@ ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_b( +define amdgpu_kernel void @select_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c, @@ -109,7 +109,7 @@ ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_c( +define amdgpu_kernel void @select_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -139,7 +139,7 @@ ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_d( +define amdgpu_kernel void @select_f16_imm_d( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ 
-168,7 +168,7 @@ ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16( +define amdgpu_kernel void @select_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -202,7 +202,7 @@ ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_a( +define amdgpu_kernel void @select_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b, <2 x half> addrspace(1)* %c, @@ -235,7 +235,7 @@ ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_b( +define amdgpu_kernel void @select_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %c, @@ -272,7 +272,7 @@ ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_c( +define amdgpu_kernel void @select_v2f16_imm_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -304,7 +304,7 @@ ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_d( +define amdgpu_kernel void @select_v2f16_imm_d( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, Index: test/CodeGen/AMDGPU/select.ll =================================================================== --- test/CodeGen/AMDGPU/select.ll +++ test/CodeGen/AMDGPU/select.ll @@ -14,7 +14,7 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, +define amdgpu_kernel void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, i32 %cond) { Index: test/CodeGen/AMDGPU/select64.ll =================================================================== --- test/CodeGen/AMDGPU/select64.ll +++ test/CodeGen/AMDGPU/select64.ll @@ -7,7 +7,7 @@ ; CHECK-NOT: s_lshr_b64 ; CHECK: v_cndmask ; CHECK: v_cndmask -define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { +define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { entry: %0 = icmp ugt i32 %cond, 5 %1 = select i1 %0, i64 0, i64 %in @@ -18,7 +18,7 @@ ; CHECK-LABEL: {{^}}select_trunc_i64: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { +define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 0, i64 %in %trunc = trunc i64 %sel to i32 @@ -29,7 +29,7 @@ ; CHECK-LABEL: {{^}}select_trunc_i64_2: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 %a, i64 %b %trunc = trunc i64 %sel to i32 @@ -40,7 +40,7 @@ ; CHECK-LABEL: {{^}}v_select_trunc_i64_2: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* 
%aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 @@ -54,7 +54,7 @@ ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}} ; CHECK: s_endpgm -define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 Index: test/CodeGen/AMDGPU/selectcc-cnd.ll =================================================================== --- test/CodeGen/AMDGPU/selectcc-cnd.ll +++ test/CodeGen/AMDGPU/selectcc-cnd.ll @@ -3,7 +3,7 @@ ;CHECK-NOT: SETE ;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, ;CHECK: 1073741824 -define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float addrspace(1)* %in) { %1 = load float, float addrspace(1)* %in %2 = fcmp oeq float %1, 0.0 %3 = select i1 %2, float 1.0, float 2.0 Index: test/CodeGen/AMDGPU/selectcc-cnde-int.ll =================================================================== --- test/CodeGen/AMDGPU/selectcc-cnde-int.ll +++ test/CodeGen/AMDGPU/selectcc-cnde-int.ll @@ -3,7 +3,7 @@ ;CHECK-NOT: SETE_INT ;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, ;CHECK-NEXT: 2 -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %1 = load i32, i32 addrspace(1)* %in %2 = icmp eq i32 %1, 0 %3 = select i1 %2, i32 1, i32 2 Index: test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll =================================================================== --- test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll +++ test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll @@ -6,7 +6,7 @@ ; CHECK-NEXT: -1 ; Test a selectcc with i32 LHS/RHS and float True/False -define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in %1 = icmp sge i32 %0, 0 Index: test/CodeGen/AMDGPU/selectcc-opt.ll =================================================================== --- test/CodeGen/AMDGPU/selectcc-opt.ll +++ test/CodeGen/AMDGPU/selectcc-opt.ll @@ -7,7 +7,7 @@ ; EG-NOT: CND ; EG: SET{{[NEQGTL]+}}_DX10 -define void @test_a(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.000000e+00 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -35,7 +35,7 @@ ; EG: SET{{[GTEQN]+}}_DX10 ; EG-NEXT: PRED_ ; EG-NEXT: ALU clause starting -define void @test_b(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -59,7 +59,7 @@ ; Test a CND*_INT instruction with float true/false values ; EG-LABEL: {{^}}test_c: ; EG: CND{{[GTE]+}}_INT -define void @test_c(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test_c(float addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 %1 = select i1 %0, float 2.0, float 
3.0 @@ -72,7 +72,7 @@ ; SI-NEXT: v_cndmask_b32_e64 ; SI-NOT: cmp ; SI-NOT: cndmask -define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = select i1 %icmp0, i32 -1, i32 0 store i32 %ext, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/selectcc.ll =================================================================== --- test/CodeGen/AMDGPU/selectcc.ll +++ test/CodeGen/AMDGPU/selectcc.ll @@ -11,7 +11,7 @@ ; SI: v_cmp_eq_u64 ; SI: v_cndmask ; SI: v_cndmask -define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { +define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { entry: %0 = icmp eq i64 %lhs, %rhs %1 = select i1 %0, i64 %true, i64 %false Index: test/CodeGen/AMDGPU/set-dx10.ll =================================================================== --- test/CodeGen/AMDGPU/set-dx10.ll +++ test/CodeGen/AMDGPU/set-dx10.ll @@ -8,7 +8,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -22,7 +22,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -34,7 +34,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -48,7 +48,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -60,7 +60,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -74,7 +74,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -86,7 +86,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) 
-define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -100,7 +100,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -112,7 +112,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -126,7 +126,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -138,7 +138,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -152,7 +152,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 Index: test/CodeGen/AMDGPU/setcc-equivalent.ll =================================================================== --- test/CodeGen/AMDGPU/setcc-equivalent.ll +++ test/CodeGen/AMDGPU/setcc-equivalent.ll @@ -3,7 +3,7 @@ ; EG-LABEL: {{^}}and_setcc_setcc_i32: ; EG: AND_INT ; EG-NEXT: SETE_INT -define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %cmp1 = icmp eq i32 %a, -1 %cmp2 = icmp eq i32 %b, -1 %and = and i1 %cmp1, %cmp2 @@ -20,7 +20,7 @@ ; EG: SETE_INT ; EG: AND_INT ; EG: SETE_INT -define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { +define amdgpu_kernel void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> %and = and <4 x i1> %cmp1, %cmp2 Index: test/CodeGen/AMDGPU/setcc-fneg-constant.ll =================================================================== --- test/CodeGen/AMDGPU/setcc-fneg-constant.ll +++ test/CodeGen/AMDGPU/setcc-fneg-constant.ll @@ -10,7 +10,7 @@ ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]] ; GCN: buffer_store_dword [[MUL]] -define void @multi_use_fneg_src() #0 { %a =
load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -33,7 +33,7 @@ ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]] ; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]] -define void @multi_foldable_use_fneg_src() #0 { +define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -59,7 +59,7 @@ ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]] ; GCN-NOT: xor ; GCN: buffer_store_dword [[MUL]] -define void @multi_use_fneg() #0 { +define amdgpu_kernel void @multi_use_fneg() #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -82,7 +82,7 @@ ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]] ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]] ; GCN: buffer_store_dword [[MUL1]] -define void @multi_foldable_use_fneg() #0 { +define amdgpu_kernel void @multi_foldable_use_fneg() #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -101,7 +101,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_oeq_posk_f32: ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_oeq_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_oeq_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -114,7 +114,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_ogt_posk_f32: ; GCN: v_cmp_gt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ogt_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ogt_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -127,7 +127,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_oge_posk_f32: ; GCN: v_cmp_ge_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_oge_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_oge_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -140,7 +140,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_olt_posk_f32: ; GCN: v_cmp_lt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_olt_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_olt_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -153,7 +153,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_ole_posk_f32: ; GCN: v_cmp_le_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ole_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ole_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -166,7 +166,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_one_posk_f32: ; GCN: v_cmp_lg_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_one_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_one_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* 
undef %y = load volatile i32, i32 addrspace(1)* undef @@ -179,7 +179,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_ueq_posk_f32: ; GCN: v_cmp_nlg_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ueq_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ueq_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -192,7 +192,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_ugt_posk_f32: ; GCN: v_cmp_nle_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ugt_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ugt_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -205,7 +205,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_uge_posk_f32: ; GCN: v_cmp_nlt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_uge_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_uge_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -218,7 +218,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_ult_posk_f32: ; GCN: v_cmp_nge_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ult_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ult_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -231,7 +231,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_ule_posk_f32: ; GCN: v_cmp_ngt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ule_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ule_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -244,7 +244,7 @@ ; GCN-LABEL: {{^}}test_setcc_fneg_une_posk_f32: ; GCN: v_cmp_neq_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_une_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_une_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef Index: test/CodeGen/AMDGPU/setcc-opt.ll =================================================================== --- test/CodeGen/AMDGPU/setcc-opt.ll +++ test/CodeGen/AMDGPU/setcc-opt.ll @@ -11,7 +11,7 @@ ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 0 @@ -28,7 +28,7 @@ ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 0 @@ -42,7 +42,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel 
void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, -1 @@ -56,7 +56,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, -1 @@ -70,7 +70,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 0 @@ -84,7 +84,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 0 @@ -98,7 +98,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 1 @@ -111,7 +111,7 @@ ; GCN: v_cmp_eq_u32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] -define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 1 @@ -124,7 +124,7 @@ ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[TMP]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, -1 @@ -137,7 +137,7 @@ ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[TMP]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, -1 @@ -159,7 +159,7 @@ ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, 255 store i1 %icmp0, i1 addrspace(1)* %out @@ -172,7 +172,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void 
@cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { %b = load i8, i8 addrspace(1)* %b.ptr %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 @@ -186,7 +186,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -207,7 +207,7 @@ ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -218,7 +218,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -229,7 +229,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 2 @@ -241,7 +241,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 2 @@ -256,7 +256,7 @@ ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 1 @@ -267,7 +267,7 @@ ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 1 @@ -278,7 +278,7 @@ ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 2 Index: test/CodeGen/AMDGPU/setcc.ll 
=================================================================== --- test/CodeGen/AMDGPU/setcc.ll +++ test/CodeGen/AMDGPU/setcc.ll @@ -9,7 +9,7 @@ ; GCN-DAG: v_cmp_eq_u32_e32 ; GCN-DAG: v_cmp_eq_u32_e64 -define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %result = icmp eq <2 x i32> %a, %b %sext = sext <2 x i1> %result to <2 x i32> store <2 x i32> %sext, <2 x i32> addrspace(1)* %out @@ -26,7 +26,7 @@ ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cmp_eq_u32_e64 -define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -43,7 +43,7 @@ ; FUNC-LABEL: {{^}}f32_oeq: ; R600: SETE_DX10 ; GCN: v_cmp_eq_f32 -define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp oeq float %a, %b %1 = sext i1 %0 to i32 @@ -54,7 +54,7 @@ ; FUNC-LABEL: {{^}}f32_ogt: ; R600: SETGT_DX10 ; GCN: v_cmp_gt_f32 -define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ogt float %a, %b %1 = sext i1 %0 to i32 @@ -65,7 +65,7 @@ ; FUNC-LABEL: {{^}}f32_oge: ; R600: SETGE_DX10 ; GCN: v_cmp_ge_f32 -define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp oge float %a, %b %1 = sext i1 %0 to i32 @@ -76,7 +76,7 @@ ; FUNC-LABEL: {{^}}f32_olt: ; R600: SETGT_DX10 ; GCN: v_cmp_lt_f32 -define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp olt float %a, %b %1 = sext i1 %0 to i32 @@ -87,7 +87,7 @@ ; FUNC-LABEL: {{^}}f32_ole: ; R600: SETGE_DX10 ; GCN: v_cmp_le_f32 -define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ole float %a, %b %1 = sext i1 %0 to i32 @@ -105,7 +105,7 @@ ; GCN: v_cmp_lg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp one float %a, %b %1 = sext i1 %0 to i32 @@ -119,7 +119,7 @@ ; R600-DAG: AND_INT ; R600-DAG: SETNE_INT ; GCN: v_cmp_o_f32 -define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ord float %a, %b %1 = sext i1 %0 to i32 @@ -137,7 +137,7 @@ ; GCN: v_cmp_nlg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ueq float %a, %b %1 = sext i1 %0 to i32 @@ -150,7 +150,7 @@ ; R600: SETE_DX10 ; GCN: v_cmp_nle_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ugt(i32 
addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ugt float %a, %b %1 = sext i1 %0 to i32 @@ -164,7 +164,7 @@ ; GCN: v_cmp_nlt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp uge float %a, %b %1 = sext i1 %0 to i32 @@ -178,7 +178,7 @@ ; GCN: v_cmp_nge_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ult float %a, %b %1 = sext i1 %0 to i32 @@ -192,7 +192,7 @@ ; GCN: v_cmp_ngt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ule float %a, %b %1 = sext i1 %0 to i32 @@ -203,7 +203,7 @@ ; FUNC-LABEL: {{^}}f32_une: ; R600: SETNE_DX10 ; GCN: v_cmp_neq_f32 -define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp une float %a, %b %1 = sext i1 %0 to i32 @@ -217,7 +217,7 @@ ; R600: OR_INT ; R600: SETNE_INT ; GCN: v_cmp_u_f32 -define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp uno float %a, %b %1 = sext i1 %0 to i32 @@ -232,7 +232,7 @@ ; FUNC-LABEL: {{^}}i32_eq: ; R600: SETE_INT ; GCN: v_cmp_eq_u32 -define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp eq i32 %a, %b %1 = sext i1 %0 to i32 @@ -243,7 +243,7 @@ ; FUNC-LABEL: {{^}}i32_ne: ; R600: SETNE_INT ; GCN: v_cmp_ne_u32 -define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ne i32 %a, %b %1 = sext i1 %0 to i32 @@ -254,7 +254,7 @@ ; FUNC-LABEL: {{^}}i32_ugt: ; R600: SETGT_UINT ; GCN: v_cmp_gt_u32 -define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ugt i32 %a, %b %1 = sext i1 %0 to i32 @@ -265,7 +265,7 @@ ; FUNC-LABEL: {{^}}i32_uge: ; R600: SETGE_UINT ; GCN: v_cmp_ge_u32 -define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp uge i32 %a, %b %1 = sext i1 %0 to i32 @@ -276,7 +276,7 @@ ; FUNC-LABEL: {{^}}i32_ult: ; R600: SETGT_UINT ; GCN: v_cmp_lt_u32 -define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ult i32 %a, %b %1 = sext i1 %0 to i32 @@ -287,7 +287,7 @@ ; FUNC-LABEL: {{^}}i32_ule: ; R600: SETGE_UINT ; GCN: v_cmp_le_u32 -define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ule i32 %a, %b %1 = sext i1 %0 to i32 @@ -298,7 +298,7 @@ ; FUNC-LABEL: {{^}}i32_sgt: ; R600: SETGT_INT ; GCN: v_cmp_gt_i32 -define void 
@i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sgt i32 %a, %b %1 = sext i1 %0 to i32 @@ -309,7 +309,7 @@ ; FUNC-LABEL: {{^}}i32_sge: ; R600: SETGE_INT ; GCN: v_cmp_ge_i32 -define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sge i32 %a, %b %1 = sext i1 %0 to i32 @@ -320,7 +320,7 @@ ; FUNC-LABEL: {{^}}i32_slt: ; R600: SETGT_INT ; GCN: v_cmp_lt_i32 -define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp slt i32 %a, %b %1 = sext i1 %0 to i32 @@ -331,7 +331,7 @@ ; FUNC-LABEL: {{^}}i32_sle: ; R600: SETGE_INT ; GCN: v_cmp_le_i32 -define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sle i32 %a, %b %1 = sext i1 %0 to i32 @@ -348,7 +348,7 @@ ; GCN-DAG: v_cmp_eq_u32 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm -define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 { +define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid @@ -369,7 +369,7 @@ ; GCN-DAG: v_cmp_eq_u32 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm -define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 { +define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid @@ -386,7 +386,7 @@ ; FUNC-LABEL: setcc-i1 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1 ; GCN: s_cmp_eq_u32 [[AND]], 0 -define void @setcc-i1(i32 %in) #0 { +define amdgpu_kernel void @setcc-i1(i32 %in) #0 { %and = and i32 %in, 1 %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %endif, label %if @@ -400,7 +400,7 @@ ; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} ; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 ; GCN: s_and_b64 s[2:3], [[A]], [[B]] -define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 { +define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 { bb0: %tmp5 = fcmp oge float %cond, 0.000000e+00 %tmp7 = fcmp ole float %cond, 1.000000e+00 Index: test/CodeGen/AMDGPU/setcc64.ll =================================================================== --- test/CodeGen/AMDGPU/setcc64.ll +++ test/CodeGen/AMDGPU/setcc64.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}f64_oeq: ; GCN: v_cmp_eq_f64 -define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp oeq double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -19,7 +19,7 @@ ; GCN-LABEL: {{^}}f64_ogt: ; GCN: v_cmp_gt_f64 -define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define 
amdgpu_kernel void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ogt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}f64_oge: ; GCN: v_cmp_ge_f64 -define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp oge double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -39,7 +39,7 @@ ; GCN-LABEL: {{^}}f64_olt: ; GCN: v_cmp_lt_f64 -define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp olt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -49,7 +49,7 @@ ; GCN-LABEL: {{^}}f64_ole: ; GCN: v_cmp_le_f64 -define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ole double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -60,7 +60,7 @@ ; GCN-LABEL: {{^}}f64_one: ; GCN: v_cmp_lg_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp one double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}f64_ord: ; GCN: v_cmp_o_f64 -define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ord double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -81,7 +81,7 @@ ; GCN-LABEL: {{^}}f64_ueq: ; GCN: v_cmp_nlg_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ueq double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -93,7 +93,7 @@ ; GCN: v_cmp_nle_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ugt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -104,7 +104,7 @@ ; GCN-LABEL: {{^}}f64_uge: ; GCN: v_cmp_nlt_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp uge double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -115,7 +115,7 @@ ; GCN-LABEL: {{^}}f64_ult: ; GCN: v_cmp_nge_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ult double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -126,7 +126,7 @@ ; GCN-LABEL: {{^}}f64_ule: ; GCN: v_cmp_ngt_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ule double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -136,7 +136,7 @@ ; GCN-LABEL: {{^}}f64_une: ; GCN: v_cmp_neq_f64 -define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel 
void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp une double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -146,7 +146,7 @@ ; GCN-LABEL: {{^}}f64_uno: ; GCN: v_cmp_u_f64 -define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp uno double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -160,7 +160,7 @@ ; GCN-LABEL: {{^}}i64_eq: ; GCN: v_cmp_eq_u64 -define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp eq i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -170,7 +170,7 @@ ; GCN-LABEL: {{^}}i64_ne: ; GCN: v_cmp_ne_u64 -define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ne i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -180,7 +180,7 @@ ; GCN-LABEL: {{^}}i64_ugt: ; GCN: v_cmp_gt_u64 -define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ugt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -190,7 +190,7 @@ ; GCN-LABEL: {{^}}i64_uge: ; GCN: v_cmp_ge_u64 -define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp uge i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -200,7 +200,7 @@ ; GCN-LABEL: {{^}}i64_ult: ; GCN: v_cmp_lt_u64 -define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ult i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -210,7 +210,7 @@ ; GCN-LABEL: {{^}}i64_ule: ; GCN: v_cmp_le_u64 -define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ule i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -220,7 +220,7 @@ ; GCN-LABEL: {{^}}i64_sgt: ; GCN: v_cmp_gt_i64 -define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sgt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -230,7 +230,7 @@ ; GCN-LABEL: {{^}}i64_sge: ; GCN: v_cmp_ge_i64 -define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sge i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -240,7 +240,7 @@ ; GCN-LABEL: {{^}}i64_slt: ; GCN: v_cmp_lt_i64 -define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp slt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -250,7 +250,7 @@ ; GCN-LABEL: {{^}}i64_sle: ; GCN: v_cmp_le_i64 -define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sle i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 Index: test/CodeGen/AMDGPU/sext-eliminate.ll =================================================================== --- test/CodeGen/AMDGPU/sext-eliminate.ll +++ test/CodeGen/AMDGPU/sext-eliminate.ll @@ -6,7 +6,7 @@ ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: SUB_INT {{[* ]*}}[[RES]] ; EG-NOT: BFE -define 
void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { +define amdgpu_kernel void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { %sext = sext i1 %a to i32 %res = add i32 %b, %sext store i32 %res, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: ADD_INT {{[* ]*}}[[RES]] ; EG-NOT: BFE -define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { +define amdgpu_kernel void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { %sext = sext i1 %a to i32 %res = sub i32 %b, %sext store i32 %res, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll =================================================================== --- test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll +++ test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll @@ -11,7 +11,7 @@ ; EG: LSHR {{\*?}} [[ADDR]] ; Works with the align 2 removed -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { %c = add <2 x i32> %a, %b %x = shl <2 x i32> %c, %y = ashr <2 x i32> %x, Index: test/CodeGen/AMDGPU/sext-in-reg.ll =================================================================== --- test/CodeGen/AMDGPU/sext-in-reg.ll +++ test/CodeGen/AMDGPU/sext-in-reg.ll @@ -15,7 +15,7 @@ ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: LSHR * [[ADDR]] ; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1 -define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { %shl = shl i32 %in, 31 %sext = ashr i32 %shl, 31 store i32 %sext, i32 addrspace(1)* %out @@ -32,7 +32,7 @@ ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 24 %ashr = ashr i32 %shl, 24 @@ -50,7 +50,7 @@ ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 16 %ashr = ashr i32 %shl, 16 @@ -68,7 +68,7 @@ ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %c = add <1 x i32> %a, %b ; add to prevent folding into extload %shl = shl <1 x i32> %c, %ashr = ashr <1 x i32> %shl, @@ -82,7 +82,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 63 %ashr = ashr i64 %shl, 63 @@ -96,7 +96,7 @@ ; 
GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 56 %ashr = ashr i64 %shl, 56 @@ -111,7 +111,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 48 %ashr = ashr i64 %shl, 48 @@ -125,7 +125,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 32 %ashr = ashr i64 %shl, 32 @@ -140,7 +140,7 @@ ; XGCN: buffer_store_dword ; XEG: BFE_INT ; XEG: ASHR -; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 { +; define amdgpu_kernel void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 { ; %c = add <1 x i64> %a, %b ; %shl = shl <1 x i64> %c, ; %ashr = ashr <1 x i64> %shl, @@ -160,7 +160,7 @@ ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -187,7 +187,7 @@ ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -214,7 +214,7 @@ ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -238,7 +238,7 @@ ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} -define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { 
+define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -264,7 +264,7 @@ ; EG: LSHL ; EG: ASHR [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b %x = shl i32 %c, 6 %y = ashr i32 %x, 7 @@ -287,7 +287,7 @@ ; EG: LSHL ; EG: ASHR [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b %x = shl <2 x i32> %c, %y = ashr <2 x i32> %x, @@ -305,7 +305,7 @@ ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, %ashr = ashr <2 x i32> %shl, @@ -326,7 +326,7 @@ ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %c = add <4 x i32> %a, %b ; add to prevent folding into extload %shl = shl <4 x i32> %c, %ashr = ashr <4 x i32> %shl, @@ -343,7 +343,7 @@ ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, %ashr = ashr <2 x i32> %shl, @@ -364,7 +364,7 @@ ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %c = add <4 x i32> %a, %b ; add to prevent folding into extload %shl = shl <4 x i32> %c, %ashr = ashr <4 x i32> %shl, @@ -381,7 +381,7 @@ ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, %ashr = ashr <2 x i32> %shl, @@ -390,7 +390,7 @@ } ; FUNC-LABEL: {{^}}testcase: -define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { +define amdgpu_kernel void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { %and_a_1 = and i8 %a, 1 %cmp_eq = icmp eq i8 %and_a_1, 0 %cmp_slt = icmp slt i8 %a, 0 @@ -402,7 +402,7 @@ } ; FUNC-LABEL: {{^}}testcase_3: -define void @testcase_3(i8 addrspace(1)* %out, 
i8 %a) #0 { +define amdgpu_kernel void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 { %and_a_1 = and i8 %a, 1 %cmp_eq = icmp eq i8 %and_a_1, 0 %cmp_slt = icmp slt i8 %a, 0 @@ -418,7 +418,7 @@ ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { +define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload @@ -431,7 +431,7 @@ ; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32: ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { +define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload @@ -446,7 +446,7 @@ ; GCN: v_max_i32 ; GCN-NOT: bfe ; GCN: buffer_store_short -define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 { +define amdgpu_kernel void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 { %tmp5 = load i8, i8 addrspace(1)* %src, align 1 %tmp2 = sext i8 %tmp5 to i32 %tmp2.5 = icmp sgt i32 %tmp2, 0 @@ -462,7 +462,7 @@ ; FUNC-LABEL: {{^}}bfe_0_width: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -473,7 +473,7 @@ ; GCN: v_bfe_i32 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone @@ -484,7 +484,7 @@ ; FUNC-LABEL: {{^}}bfe_8_bfe_16: ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN: s_endpgm -define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone @@ -497,7 +497,7 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel 
void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone @@ -510,7 +510,7 @@ ; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone %shl = shl i32 %bfe, 24 @@ -520,7 +520,7 @@ } ; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: -define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone %shl = shl i32 %bfe, 24 @@ -533,7 +533,7 @@ ; GCN: buffer_load_sbyte ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { %load = load i8, i8 addrspace(1)* %ptr, align 1 %sext = sext i8 %load to i32 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone @@ -547,7 +547,7 @@ ; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { %load = load i8, i8 addrspace(1)* %ptr, align 1 %sext = sext i8 %load to i32 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone @@ -562,7 +562,7 @@ ; GCN-NOT: shl ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; GCN: s_endpgm -define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -577,7 +577,7 @@ ; GCN-NOT: shr ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 ; GCN: s_endpgm -define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 30 %shr = ashr i32 %shl, 30 @@ -593,7 +593,7 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 ; GCN: s_endpgm -define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 30 %shr = ashr i32 %shl, 30 @@ -617,7 +617,7 @@ ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void 
@v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { +define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -647,7 +647,7 @@ ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { +define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -673,7 +673,7 @@ ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 15 @@ -692,7 +692,7 @@ ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 -define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 14 @@ -706,7 +706,7 @@ ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}} ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] -define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid @@ -727,7 +727,7 @@ ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}} ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] -define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind { +define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -753,7 +753,7 @@ ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} -define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 14 %sext = ashr i16 %shl, 14 store i16 %sext, i16 addrspace(1)* %out @@ -770,7 +770,7 @@ ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} ; GFX89: s_sext_i32_i16 
s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} -define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 8 %sext = ashr i16 %shl, 8 store i16 %sext, i16 addrspace(1)* %out @@ -787,7 +787,7 @@ ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} -define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 1 %sext = ashr i16 %shl, 1 store i16 %sext, i16 addrspace(1)* %out @@ -798,7 +798,7 @@ ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]] ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]] -define void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { %c = add <2 x i16> %a, %b ; add to prevent folding into extload %shl = shl <2 x i16> %c, %ashr = ashr <2 x i16> %shl, @@ -813,7 +813,7 @@ ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} -define void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload %shl = shl <3 x i16> %c, %ashr = ashr <3 x i16> %shl, @@ -825,7 +825,7 @@ ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]] ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]] -define void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { %c = add <2 x i16> %a, %b ; add to prevent folding into extload %shl = shl <2 x i16> %c, %ashr = ashr <2 x i16> %shl, @@ -837,7 +837,7 @@ ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]] ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]] -define void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { %c = add <2 x i16> %a, %b ; add to prevent folding into extload %shl = shl <2 x i16> %c, %ashr = ashr <2 x i16> %shl, @@ -852,7 +852,7 @@ ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} -define void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload %shl = shl <3 x i16> %c, %ashr = ashr <3 x i16> %shl, Index: test/CodeGen/AMDGPU/sgpr-control-flow.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -13,7 +13,7 @@ ; SI: s_sub -define 
void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -52,7 +52,7 @@ ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]] ; SI: buffer_store_dword ; SI-NEXT: s_endpgm -define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -79,7 +79,7 @@ ; SI: s_add_i32 [[SGPR:s[0-9]+]] ; SI-NOT: s_add_i32 [[SGPR]] -define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %tid_f = uitofp i32 %tid to float @@ -116,7 +116,7 @@ ; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] ; SI: buffer_store_dword [[RESULT]] -define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp1 = icmp eq i32 %tid, 0 Index: test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll +++ test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -6,7 +6,7 @@ ; SI-LABEL: {{^}}test_dup_operands: ; SI: v_add_i32_e32 -define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { +define amdgpu_kernel void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %lo = extractelement <2 x i32> %a, i32 0 %hi = extractelement <2 x i32> %a, i32 1 Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -265,7 +265,7 @@ ; CHECK: buffer_load_dword ; CHECK: v_add ; CHECK: s_endpgm -define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { +define amdgpu_kernel void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { entry: %tmp = load float, float addrspace(1)* %in0 %tmp1 = fcmp oeq float %tmp, 0.000000e+00 Index: test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -11,7 +11,7 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -33,7 +33,7 @@ ; GCN-DAG: buffer_store_dwordx4 
v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -55,7 +55,7 @@ ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -77,7 +77,7 @@ ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -100,7 +100,7 @@ ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x Index: test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -8,7 +8,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -25,7 +25,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -41,7 +41,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; 
GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -57,7 +57,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -73,7 +73,7 @@ ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -89,7 +89,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -105,7 +105,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -121,7 +121,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -137,7 +137,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { 
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -155,7 +155,7 @@ ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -171,7 +171,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -188,7 +188,7 @@ ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -204,7 +204,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -220,7 +220,7 @@ ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 ; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} -define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -236,7 +236,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN: buffer_store_dword v[[SHIFT]] -define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = 
getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -252,7 +252,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}} ; GCN: buffer_store_dword [[BFE]] -define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -268,7 +268,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}} ; GCN: buffer_store_dword [[BFE]] -define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -286,7 +286,7 @@ ; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]] ; GCN-NOT: v[[SHRLO]] ; GCN: buffer_store_dword v[[SHRLO]] -define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -306,7 +306,7 @@ ; GCN-NOT: v[[SHRLO]] ; GCN-NOT: v[[SHRHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} -define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -327,7 +327,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -347,7 +347,7 @@ ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}} -define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -365,7 +365,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: 
buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} ; GCN: buffer_store_dword v[[ZERO]] -define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x Index: test/CodeGen/AMDGPU/shift-i64-opts.ll =================================================================== --- test/CodeGen/AMDGPU/shift-i64-opts.ll +++ test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -8,7 +8,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 35 store i64 %shl, i64 addrspace(1)* %out @@ -20,7 +20,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -32,7 +32,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 33 store i64 %shl, i64 addrspace(1)* %out @@ -43,7 +43,7 @@ ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -58,7 +58,7 @@ ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff %shl = lshr i64 %and, 40 @@ -73,7 +73,7 @@ ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 35 store i64 %shl, i64 addrspace(1)* %out @@ -84,7 +84,7 @@ ; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: 
buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -96,7 +96,7 @@ ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -106,7 +106,7 @@ ; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x) ; GCN-LABEL: {{^}}ashr_i64_const_32: -define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = ashr i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -114,7 +114,7 @@ } ; GCN-LABEL: {{^}}ashr_i64_const_63: -define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = ashr i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -125,7 +125,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 31 %trunc = trunc i64 %shl to i32 @@ -137,7 +137,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]] ; GCN: buffer_store_short [[SHL]] -define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 15 %trunc = trunc i64 %shl to i16 @@ -149,7 +149,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]] ; GCN: buffer_store_short [[SHL]] -define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in %shl = shl i32 %val, 15 %trunc = trunc i32 %shl to i16 @@ -161,7 +161,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]] ; GCN: buffer_store_byte [[SHL]] -define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 7 %trunc = trunc i64 %shl to i8 @@ -174,7 +174,7 @@ ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]] ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]] ; GCN: buffer_store_byte [[AND]] -define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl 
= shl i64 %val, 1 %trunc = trunc i64 %shl to i2 @@ -186,7 +186,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 1 %trunc = trunc i64 %shl to i32 @@ -198,7 +198,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 16 %trunc = trunc i64 %shl to i32 @@ -209,7 +209,7 @@ ; GCN-LABEL: {{^}}trunc_shl_33_i32_i64: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[ZERO]] -define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 33 %trunc = trunc i64 %shl to i32 @@ -222,7 +222,7 @@ ; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}} ; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in %shl = shl <2 x i64> %val, %trunc = trunc <2 x i64> %shl to <2 x i32> @@ -235,7 +235,7 @@ ; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31 ; GCN: buffer_store_dword v[[RESLO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 31 %trunc = trunc i64 %shl to i32 Index: test/CodeGen/AMDGPU/shl.ll =================================================================== --- test/CodeGen/AMDGPU/shl.ll +++ test/CodeGen/AMDGPU/shl.ll @@ -17,7 +17,7 @@ ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -44,7 +44,7 @@ ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -57,7 +57,7 @@ ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: 
v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr @@ -70,7 +70,7 @@ ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { +define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { %a = load i16, i16 addrspace(1)* %in %result = shl i16 %a, %b store i16 %result, i16 addrspace(1)* %out @@ -81,7 +81,7 @@ ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { +define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { %a = load i16, i16 addrspace(1)* %in %b.add = add i16 %b, 3 %result = shl i16 %a, %b.add @@ -92,7 +92,7 @@ ; GCN-LABEL: {{^}}shl_i16_computed_amount: ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 3, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, [[ADD]], v{{[0-9]+}} -define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -107,7 +107,7 @@ ; GCN-LABEL: {{^}}shl_i16_i_s: ; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 12 -define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { +define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { %result = shl i16 %a, 12 store i16 %result, i16 addrspace(1)* %out ret void @@ -116,7 +116,7 @@ ; GCN-LABEL: {{^}}shl_v2i16: ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -133,7 +133,7 @@ ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid @@ -160,7 +160,7 @@ ; GCN-LABEL: {{^}}shl_i64: ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} ; VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 
addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -199,7 +199,7 @@ ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -262,7 +262,7 @@ ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -277,7 +277,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { %result = shl i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void @@ -287,7 +287,7 @@ ; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]], ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}} -define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -299,7 +299,7 @@ ; FUNC-LABEL: {{^}}s_shl_constant_i64 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { %shl = shl i64 281474976710655, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -311,7 +311,7 @@ ; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}} ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]] ; SI: buffer_store_dwordx2 -define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 1231231234567, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -323,7 +323,7 @@ ; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}} ; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}} ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]] -define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 1234567, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -332,7 +332,7 @@ ; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64: ; SI: v_lshl_b64 
{{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}} -define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 64, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -341,7 +341,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}} -define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 64, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -349,7 +349,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}} -define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 1, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -357,7 +357,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}} -define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4607182418800017408, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -365,7 +365,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13830554455654793216, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -373,7 +373,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}} -define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4602678819172646912, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -381,7 +381,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13826050856027422720, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -389,7 +389,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}} -define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4611686018427387904, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -397,7 +397,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, 
i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13835058055282163712, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -405,7 +405,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}} -define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4616189618054758400, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -413,7 +413,7 @@ ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13839561654909534208, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -427,7 +427,7 @@ ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 1082130432, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -439,7 +439,7 @@ ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} ; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 -1065353216, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -450,7 +450,7 @@ ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4647714815446351872, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -460,7 +460,7 @@ ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13871086852301127680, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -468,7 +468,7 @@ ; FUNC-LABEL: {{^}}test_mul2: ; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 -define void @test_mul2(i32 %p) { +define amdgpu_kernel void @test_mul2(i32 %p) { %i = mul i32 %p, 2 store volatile i32 %i, i32 addrspace(1)* undef ret void Index: test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/shl.v2i16.ll 
+++ test/CodeGen/AMDGPU/shl.v2i16.ll @@ -13,7 +13,7 @@ ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI: v_or_b32_e32 -define void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -38,7 +38,7 @@ ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -55,7 +55,7 @@ ; GFX9: s_load_dword [[RHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -70,7 +70,7 @@ ; GFX9: s_load_dword [[LHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -84,7 +84,7 @@ ; GCN-LABEL: {{^}}shl_imm_v_v2i16: ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 -define void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -98,7 +98,7 @@ ; GCN-LABEL: {{^}}shl_v_imm_v2i16: ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] -define void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -115,7 +115,7 @@ ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x 
i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -133,7 +133,7 @@ ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext Index: test/CodeGen/AMDGPU/shl_add_constant.ll =================================================================== --- test/CodeGen/AMDGPU/shl_add_constant.ll +++ test/CodeGen/AMDGPU/shl_add_constant.ll @@ -9,7 +9,7 @@ ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -25,7 +25,7 @@ ; SI-DAG: buffer_store_dword [[ADDREG]] ; SI-DAG: buffer_store_dword [[SHLREG]] ; SI: s_endpgm -define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -43,7 +43,7 @@ ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -61,7 +61,7 @@ ; SI: s_addk_i32 [[RESULT]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %shl, %y @@ -78,7 +78,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]] ; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %y, %shl Index: test/CodeGen/AMDGPU/shl_add_ptr.ll =================================================================== --- test/CodeGen/AMDGPU/shl_add_ptr.ll +++ test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -19,7 +19,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 ; SI: s_endpgm -define void @load_shl_base_lds_0(float addrspace(1)* %out, 
i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -39,7 +39,7 @@ ; SI-DAG: buffer_store_dword [[RESULT]] ; SI-DAG: buffer_store_dword [[ADDUSE]] ; SI: s_endpgm -define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -55,7 +55,7 @@ ; SI-LABEL: {{^}}load_shl_base_lds_max_offset ; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 ; SI: s_endpgm -define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 65535 %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 @@ -73,7 +73,7 @@ ; SI: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 ; SI: s_endpgm -define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -89,7 +89,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -104,7 +104,7 @@ @lds2 = addrspace(3) global [512 x i32] undef, align 4 -; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 ; %idx.0 = add nsw i32 %tid.x, 2 ; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -119,7 +119,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { +define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -134,7 +134,7 @@ ; SI: v_lshlrev_b32_e32 
[[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -148,7 +148,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -162,7 +162,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -176,7 +176,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -190,7 +190,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -204,7 +204,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -214,7 +214,7 @@ ret void } -; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 
; %idx.0 = add nsw i32 %tid.x, 2 ; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -228,7 +228,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -242,7 +242,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -256,7 +256,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -270,7 +270,7 @@ ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 Index: test/CodeGen/AMDGPU/shrink-add-sub-constant.ll =================================================================== --- test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}v_test_i32_x_sub_64: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] -define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -23,7 +23,7 @@ ; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]] ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]] -define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 
addrspace(1)* %in, i64 %tid.ext @@ -40,7 +40,7 @@ ; GCN-LABEL: {{^}}v_test_i32_64_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] -define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}v_test_i32_x_sub_65: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]] -define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -68,7 +68,7 @@ ; GCN-LABEL: {{^}}v_test_i32_65_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]] -define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -82,7 +82,7 @@ ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]] -define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -96,7 +96,7 @@ ; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]] -define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -110,7 +110,7 @@ ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]] -define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -124,7 +124,7 @@ ; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]] -define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -138,7 +138,7 @@ ; GCN-LABEL: 
{{^}}s_test_i32_x_sub_64: ; GCN: s_load_dword [[X:s[0-9]+]] ; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64 -define void @s_test_i32_x_sub_64(i32 %x) #0 { +define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { %result = sub i32 %x, 64 call void asm sideeffect "; use $0", "s"(i32 %result) ret void @@ -147,7 +147,7 @@ ; GCN-LABEL: {{^}}v_test_i16_x_sub_64: ; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]] ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]] -define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext @@ -166,7 +166,7 @@ ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]] -define void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext Index: test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir =================================================================== --- test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -7,7 +7,7 @@ # resume crashes --- | - define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -20,7 +20,7 @@ ret void } - define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -33,7 +33,7 @@ ret void } - define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -46,7 +46,7 @@ ret void } - define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -59,7 +59,7 @@ ret void } - define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -72,7 +72,7 @@ ret void } - define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext Index: test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -9,7 +9,7 @@ ; GCN: s_cbranch_vccnz ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 -define void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 @@ -40,7 +40,7 @@ ; GCN: s_cbranch_scc1 ; GCN: s_endpgm ; GCN: .Lfunc_end1 -define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 Index: test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll +++ test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll @@ -11,7 +11,7 @@ ; GCN: s_and_saveexec_b64 ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 -define void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +define amdgpu_kernel void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 Index: test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf.ll +++ test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -10,7 +10,7 @@ ; SI: s_andn2_b64 ; s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { main_body: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = and i32 %a, %tid @@ -40,7 +40,7 @@ ; SI: s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @phi_cond_outside_loop(i32 %b) { +define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = icmp eq i32 %tid , 0 @@ -68,7 +68,7 @@ ; CHECK-LABEL: {{^}}switch_unreachable: ; CHECK-NOT: s_endpgm ; CHECK: .Lfunc_end2 -define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { +define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { centry: switch i32 %x, label %sw.default [ i32 0, label %sw.bb @@ -100,7 +100,7 @@ ; SI: [[ENDPGM]]: ; SI: s_endpgm -define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { +define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { entry: %cmp = icmp sgt i32 %c0, 0 br label %while.cond.outer Index: test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -6,7 +6,7 @@ ; CHECK s_or_b64 exec, exec ; CHECK s_andn2_b64 exec, exec ; CHECK s_cbranch_execnz -define void @test(i32 
%arg, i32 %arg1) { +define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { bb: %tmp = icmp ne i32 %arg, 0 %tmp7 = icmp ne i32 %arg1, 0 Index: test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir =================================================================== --- test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir +++ test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies %s -o - | FileCheck %s -check-prefixes=GCN --- | - define void @phi_visit_order() { ret void } + define amdgpu_kernel void @phi_visit_order() { ret void } name: phi_visit_order tracksRegLiveness: true Index: test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll =================================================================== --- test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -5,7 +5,7 @@ ; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load volatile i32, i32 addrspace(1)* %in Index: test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll =================================================================== --- test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -14,7 +14,7 @@ ; GCN: [[UNREACHABLE]]: ; GCN: ds_write_b32 ; GCN: s_waitcnt -define void @lower_control_flow_unreachable_terminator() #0 { +define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp eq i32 %tmp15, 32 @@ -41,7 +41,7 @@ ; GCN-NEXT: s_or_b64 exec, exec ; GCN: ds_write_b32 ; GCN: s_waitcnt -define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { +define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp eq i32 %tmp15, 32 Index: test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -24,7 +24,7 @@ ; SMEM: s_dcache_wb ; ALL: s_endpgm -define void @test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -13,7 +13,7 @@ ; FUNC-LABEL: @reorder_local_load_global_store_local_load ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 ; CI: buffer_store_dword -define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* 
addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -33,7 +33,7 @@ ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; CI: buffer_store_dword ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -53,7 +53,7 @@ ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: buffer_store_dword -define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -77,7 +77,7 @@ ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: buffer_store_dword -define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 @@ -100,7 +100,7 @@ ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: ds_write_b32 ; CI: buffer_store_dword -define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { +define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 @@ -122,7 +122,7 @@ ; CI: s_load_dword ; CI: ds_write_b32 ; CI: buffer_store_dword -define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { +define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 @@ -141,7 +141,7 @@ ; CI: buffer_load_dword ; CI: buffer_load_dword ; CI: buffer_store_dword -define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { +define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3 @@ -161,7 +161,7 @@ ; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 ; CI: buffer_store_dword ; CI: s_endpgm -define void 
@reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { +define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102 @@ -187,7 +187,7 @@ ; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 ; CI: buffer_store_dword ; CI: s_endpgm -define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { +define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102 @@ -221,7 +221,7 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}} ; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}} -define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { +define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 Index: test/CodeGen/AMDGPU/si-vector-hang.ll =================================================================== --- test/CodeGen/AMDGPU/si-vector-hang.ll +++ test/CodeGen/AMDGPU/si-vector-hang.ll @@ -12,7 +12,7 @@ ; CHECK: buffer_store_byte ; ModuleID = 'radeon' -define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { +define amdgpu_kernel void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { entry: %0 = load i8, i8 addrspace(1)* %in0, align 1 %1 = insertelement <8 x i8> undef, i8 %0, i32 0 Index: test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- test/CodeGen/AMDGPU/sign_extend.ll +++ test/CodeGen/AMDGPU/sign_extend.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 ; GCN: s_endpgm -define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -14,7 +14,7 @@ ; GCN-LABEL: {{^}}test_s_sext_i32_to_i64: ; GCN: s_ashr_i32 ; GCN: s_endpg -define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { entry: %mul = mul i32 %a, %b %add = add i32 %mul, %c @@ -28,7 +28,7 @@ ; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} ; GCN: s_endpgm -define void 
@s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}s_sext_i32_to_i64: ; GCN: s_ashr_i32 ; GCN: s_endpgm -define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { %sext = sext i32 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void @@ -47,7 +47,7 @@ ; GCN-LABEL: {{^}}v_sext_i32_to_i64: ; GCN: v_ashr ; GCN: s_endpgm -define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %sext = sext i32 %val to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -56,7 +56,7 @@ ; GCN-LABEL: {{^}}s_sext_i16_to_i64: ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 -define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { +define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { %sext = sext i16 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void @@ -65,7 +65,7 @@ ; GCN-LABEL: {{^}}s_sext_i1_to_i16: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 store i16 %sext, i16 addrspace(1)* %out @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %cmp = and i1 %cmp0, %cmp1 @@ -91,7 +91,7 @@ ; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp0 = icmp eq i32 %a, %tid %cmp1 = icmp eq i32 %b, %c @@ -130,7 +130,7 @@ ; GCN-DAG: buffer_store_dword [[VEXT3]] ; GCN: s_endpgm -define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> %elt0 = extractelement <4 x i32> %ext, i32 0 @@ -162,7 +162,7 @@ ; GCN: buffer_store_dword [[EXT1]] ; GCN: buffer_store_dword [[EXT2]] ; GCN: buffer_store_dword [[EXT3]] -define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %a = load i32, i32 addrspace(1)* %in %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> 
@@ -184,7 +184,7 @@ ; GCN-DAG: s_sext_i32_i16 ; GCN-DAG: s_sext_i32_i16 ; GCN: s_endpgm -define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { %cast = bitcast i64 %a to <4 x i16> %ext = sext <4 x i16> %cast to <4 x i32> %elt0 = extractelement <4 x i32> %ext, i32 0 @@ -206,7 +206,7 @@ ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: s_endpgm -define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %a = load i64, i64 addrspace(1)* %in %cast = bitcast i64 %a to <4 x i16> %ext = sext <4 x i16> %cast to <4 x i32> Index: test/CodeGen/AMDGPU/sint_to_fp.f64.ll =================================================================== --- test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -4,7 +4,7 @@ ; SI-LABEL: {{^}}sint_to_fp_i32_to_f64 ; SI: v_cvt_f64_i32_e32 -define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { %result = sitofp i32 %in to double store double %result, double addrspace(1)* %out ret void @@ -19,7 +19,7 @@ ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}} ; SI: s_endpgm -define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = sitofp i1 %cmp to double store double %fp, double addrspace(1)* %out, align 4 @@ -31,14 +31,14 @@ ; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { +define amdgpu_kernel void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { %fp = sitofp i1 %in to double store double %fp, double addrspace(1)* %out, align 8 ret void } ; SI-LABEL: @s_sint_to_fp_i64_to_f64 -define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { %result = sitofp i64 %in to double store double %result, double addrspace(1)* %out ret void @@ -51,7 +51,7 @@ ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %gep, align 8 Index: test/CodeGen/AMDGPU/sint_to_fp.i64.ll =================================================================== --- test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -4,7 +4,7 @@ ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600 ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f16: -define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { %result = 
sitofp i64 %in to half store half %result, half addrspace(1)* %out ret void @@ -28,7 +28,7 @@ ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], ; GCN: v_cvt_f16_f32_e32 [[SIGN_SEL_F16:v[0-9]+]], [[SIGN_SEL]] ; GCN: {{buffer|flat}}_store_short {{.*}}[[SIGN_SEL_F16]] -define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -39,7 +39,7 @@ } ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32: -define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { %result = sitofp i64 %in to float store float %result, float addrspace(1)* %out ret void @@ -62,7 +62,7 @@ ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], ; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]] -define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -74,14 +74,14 @@ ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f32: ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1, -define void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ %result = sitofp <2 x i64> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f32: -define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -93,14 +93,14 @@ ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f16: ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1, -define void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ %result = sitofp <2 x i64> %in to <2 x half> store <2 x half> %result, <2 x half> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f16: -define void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/sint_to_fp.ll =================================================================== --- test/CodeGen/AMDGPU/sint_to_fp.ll +++ test/CodeGen/AMDGPU/sint_to_fp.ll @@ -6,7 +6,7 @@ ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, 
{{s[0-9]+$}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { %result = sitofp i32 %in to float store float %result, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{v[0-9]+$}} ; R600: INT_TO_FLT -define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -32,7 +32,7 @@ ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X -define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{ %result = sitofp <2 x i32> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -49,7 +49,7 @@ ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %value = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = sitofp <4 x i32> %value to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -81,7 +81,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to float store float %fp, float addrspace(1)* %out @@ -92,7 +92,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 { %fp = sitofp i1 %in to float store float %fp, float addrspace(1)* %out ret void @@ -105,7 +105,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]] ; SI: s_endpgm -define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i1, 
i1 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/sitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/sitofp.f16.ll +++ test/CodeGen/AMDGPU/sitofp.f16.ll @@ -7,7 +7,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sitofp_i16_to_f16( +define amdgpu_kernel void @sitofp_i16_to_f16( half addrspace(1)* %r, i16 addrspace(1)* %a) { entry: @@ -23,7 +23,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sitofp_i32_to_f16( +define amdgpu_kernel void @sitofp_i32_to_f16( half addrspace(1)* %r, i32 addrspace(1)* %a) { entry: @@ -45,7 +45,7 @@ ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @sitofp_v2i16_to_v2f16( +define amdgpu_kernel void @sitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { entry: @@ -65,7 +65,7 @@ ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @sitofp_v2i32_to_v2f16( +define amdgpu_kernel void @sitofp_v2i32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/smed3.ll =================================================================== --- test/CodeGen/AMDGPU/smed3.ll +++ test/CodeGen/AMDGPU/smed3.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32: ; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -25,7 +25,7 @@ ; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32: ; GCN: v_max_i32 ; GCN: v_min_i32 -define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32: ; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -64,7 +64,7 @@ ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -83,7 
+83,7 @@ ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64: ; GCN: v_cmp_lt_i64 ; GCN: v_cmp_gt_i64 -define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -102,7 +102,7 @@ ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16: ; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid @@ -174,7 +174,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -186,7 +186,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -198,7 +198,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -210,7 +210,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -222,7 +222,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -234,7 +234,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -246,7 +246,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 
v{{[0-9]+}} -define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -258,7 +258,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -270,7 +270,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -282,7 +282,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -294,7 +294,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -306,7 +306,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -318,7 +318,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -330,7 +330,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -342,7 +342,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 
%y) @@ -354,7 +354,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -370,7 +370,7 @@ ; GCN: s_sext_i32_i16 ; GCN: s_sext_i32_i16 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { bb: %tmp0 = call i16 @smin16(i16 %x, i16 %y) %tmp1 = call i16 @smax16(i16 %x, i16 %y) @@ -385,7 +385,7 @@ ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { bb: %tmp0 = call i8 @smin8(i8 %x, i8 %y) %tmp1 = call i8 @smax8(i8 %x, i8 %y) @@ -397,7 +397,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -410,7 +410,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -423,7 +423,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -436,7 +436,7 @@ ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -457,7 +457,7 @@ ; VI: v_max_i16 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid Index: test/CodeGen/AMDGPU/sminmax.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.ll +++ test/CodeGen/AMDGPU/sminmax.ll @@ -7,7 +7,7 @@ ; GCN: s_add_i32 ; EG: 
MAX_INT -define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { +define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg %res = select i1 %cond, i32 %val, i32 %neg @@ -22,7 +22,7 @@ ; GCN: v_add_i32 ; EG: MAX_INT -define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { %val = load i32, i32 addrspace(1)* %src, align 4 %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg @@ -36,7 +36,7 @@ ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]] ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]] -define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { %val = load i32, i32 addrspace(1)* %src, align 4 %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg @@ -54,7 +54,7 @@ ; EG: MAX_INT ; EG: MAX_INT -define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { +define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { %z0 = insertelement <2 x i32> undef, i32 0, i32 0 %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 %t0 = insertelement <2 x i32> undef, i32 2, i32 0 @@ -79,7 +79,7 @@ ; EG: MAX_INT ; EG: MAX_INT -define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { %z0 = insertelement <2 x i32> undef, i32 0, i32 0 %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 %t0 = insertelement <2 x i32> undef, i32 2, i32 0 @@ -109,7 +109,7 @@ ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { +define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { %z0 = insertelement <4 x i32> undef, i32 0, i32 0 %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 @@ -146,7 +146,7 @@ ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { %z0 = insertelement <4 x i32> undef, i32 0, i32 0 %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 @@ -170,7 +170,7 @@ ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] -define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { +define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { %cond0 = icmp sgt i32 %val0, %val1 %sel0 = select i1 %cond0, i32 %val0, i32 %val1 %sel1 = select i1 %cond0, i32 %val1, i32 %val0 @@ -186,7 +186,7 @@ ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] -define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { +define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 
addrspace(1)* %ptr1) nounwind { %val0 = load volatile i32, i32 addrspace(1)* %ptr0 %val1 = load volatile i32, i32 addrspace(1)* %ptr1 @@ -208,7 +208,7 @@ ; GCN-DAG: s_max_i32 ; GCN-DAG: s_max_i32 ; GCN-DAG: s_max_i32 -define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind { +define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind { %cond0 = icmp sgt <4 x i32> %val0, %val1 %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1 %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0 @@ -223,7 +223,7 @@ ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc -define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { +define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { %val0 = load volatile i32, i32 addrspace(1)* %ptr0 %val1 = load volatile i32, i32 addrspace(1)* %ptr1 Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -17,7 +17,7 @@ ; CIVI: v_add_i32_e32 ; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, ; CIVI: v_or_b32_e32 -define void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { +define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %neg = sub <2 x i16> zeroinitializer, %val %cond = icmp sgt <2 x i16> %val, %neg %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg @@ -41,7 +41,7 @@ ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} ; VI-NOT: v_and_b32 ; VI: v_or_b32_e32 -define void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { +define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -59,7 +59,7 @@ ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -define void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { +define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 %t0 = insertelement <2 x i16> undef, i16 2, i16 0 @@ -77,7 +77,7 @@ ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -define void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { +define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 %t0 = insertelement <2 x i16> undef, i16 2, i16 0 @@ -101,7 +101,7 @@ ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]] ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 -define void 
@s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { +define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 %z2 = insertelement <4 x i16> %z1, i16 0, i16 2 @@ -128,7 +128,7 @@ ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]] ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 -define void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 { +define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 %z2 = insertelement <4 x i16> %z1, i16 0, i16 2 @@ -147,7 +147,7 @@ } ; GCN-LABEL: {{^}}s_min_max_v2i16: -define void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 { +define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 { %cond0 = icmp sgt <2 x i16> %val0, %val1 %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 @@ -158,7 +158,7 @@ } ; GCN-LABEL: {{^}}v_min_max_v2i16: -define void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0 %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1 @@ -172,7 +172,7 @@ } ; GCN-LABEL: {{^}}s_min_max_v4i32: -define void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { +define amdgpu_kernel void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { %cond0 = icmp sgt <4 x i16> %val0, %val1 %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1 %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0 @@ -183,7 +183,7 @@ } ; GCN-LABEL: {{^}}v_min_max_v2i16_user: -define void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0 %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1 @@ -200,7 +200,7 @@ ; GCN-LABEL: {{^}}u_min_max_v2i16: ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind { +define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind { %cond0 = icmp ugt <2 x i16> %val0, %val1 %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 Index: 
test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -12,7 +12,7 @@ ; GCN: buffer_store_dword ; GCN: [[EXIT]]: ; GCN: s_endpgm -define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { +define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { entry: %cnd = fcmp oeq float 0.0, %cond %sgpr = load volatile i32, i32 addrspace(2)* %in @@ -32,7 +32,7 @@ ; GCN: buffer_store_dword ; GCN: [[EXIT]]: ; GCN: s_endpgm -define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) { +define amdgpu_kernel void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) { entry: %vgpr = load volatile float, float addrspace(1)* %in %cnd = fcmp oeq float 0.0, %vgpr Index: test/CodeGen/AMDGPU/smrd.ll =================================================================== --- test/CodeGen/AMDGPU/smrd.ll +++ test/CodeGen/AMDGPU/smrd.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -18,7 +18,7 @@ ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -33,7 +33,7 @@ ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -48,7 +48,7 @@ ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; GCN: s_endpgm -define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -62,7 +62,7 @@ ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -76,7 +76,7 @@ ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 
addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 %tmp1 = load i32, i32 addrspace(2)* %tmp Index: test/CodeGen/AMDGPU/sopk-compares.ll =================================================================== --- test/CodeGen/AMDGPU/sopk-compares.ll +++ test/CodeGen/AMDGPU/sopk-compares.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}} -define void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -25,7 +25,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x7fff{{$}} -define void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 32767 br i1 %cmp0, label %endif, label %if @@ -41,7 +41,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max_p1: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -57,7 +57,7 @@ ; GCN-LABEL: {{^}}br_scc_ne_i32_simm16_max_p1: ; GCN: s_cmpk_lg_u32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, -32768 br i1 %cmp0, label %endif, label %if @@ -89,7 +89,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min_m1: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0xffff7fff{{$}} -define void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, -32769 br i1 %cmp0, label %endif, label %if @@ -105,7 +105,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm15_max: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}} -define void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65535 br i1 %cmp0, label %endif, label %if @@ -121,7 +121,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}} -define void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65535 br i1 %cmp0, label %endif, label %if @@ -137,7 +137,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max_p1: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0x10000{{$}} -define void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65536 br i1 %cmp0, 
label %endif, label %if @@ -154,7 +154,7 @@ ; GCN-LABEL: {{^}}br_scc_eq_i32: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -170,7 +170,7 @@ ; GCN-LABEL: {{^}}br_scc_ne_i32: ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -186,7 +186,7 @@ ; GCN-LABEL: {{^}}br_scc_sgt_i32: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -202,7 +202,7 @@ ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x7fff{{$}} -define void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 32767 br i1 %cmp0, label %endif, label %if @@ -218,7 +218,7 @@ ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max_p1: ; GCN: s_cmp_gt_i32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -234,7 +234,7 @@ ; GCN-LABEL: {{^}}br_scc_sge_i32: ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sge i32 %cond, %size @@ -251,7 +251,7 @@ ; GCN-LABEL: {{^}}br_scc_slt_i32: ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp slt i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -267,7 +267,7 @@ ; GCN-LABEL: {{^}}br_scc_sle_i32: ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sle i32 %cond, %size @@ -284,7 +284,7 @@ ; GCN-LABEL: {{^}}br_scc_ugt_i32: ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ugt i32 %cond, %size @@ -301,7 +301,7 @@ ; GCN-LABEL: {{^}}br_scc_uge_i32: ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp uge i32 %cond, %size @@ -318,7 +318,7 @@ ; GCN-LABEL: {{^}}br_scc_ult_i32: ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_ult_i32(i32 %cond, i32 
addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -334,7 +334,7 @@ ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16: ; GCN: s_cmp_lt_u32 s2, 0xffff8000 -define void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32768 br i1 %cmp0, label %endif, label %if @@ -350,7 +350,7 @@ ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16_m1: ; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff7fff{{$}} -define void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32769 br i1 %cmp0, label %endif, label %if @@ -366,7 +366,7 @@ ; GCN-LABEL: {{^}}br_scc_ule_i32: ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ule i32 %cond, %size @@ -383,7 +383,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_eq_i32: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp eq i32 %size, %cond @@ -400,7 +400,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_ne_i32: ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ne i32 %size, %cond @@ -417,7 +417,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_sgt_i32: ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sgt i32 %size, %cond @@ -434,7 +434,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_sge_i32: ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sge i32 %size, %cond @@ -451,7 +451,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_slt_i32: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp slt i32 %size, %cond @@ -468,7 +468,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_sle_i32: ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sle i32 %size, %cond @@ -485,7 +485,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_ugt_i32: ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}} -define void 
@commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ugt i32 %size, %cond @@ -502,7 +502,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_uge_i32: ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp uge i32 %size, %cond @@ -519,7 +519,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_ult_i32: ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ult i32 %size, %cond @@ -536,7 +536,7 @@ ; GCN-LABEL: {{^}}commute_br_scc_ule_i32: ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ule i32 %size, %cond @@ -553,7 +553,7 @@ ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16: ; GCN: s_cmp_lt_u32 s2, 0xfffff7ff -define void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %not.size = xor i32 %size, -1 @@ -573,7 +573,7 @@ ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4 ; SI: v_cmp_eq_u64_e64 -define void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i64 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -593,7 +593,7 @@ ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_eq_u64_e32 -define void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i64 %cond, 1234 br i1 %cmp0, label %endif, label %if @@ -611,7 +611,7 @@ ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 4 ; SI: v_cmp_ne_u64_e64 -define void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i64 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -631,7 +631,7 @@ ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_ne_u64_e32 -define void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i64 %cond, 1234 br i1 %cmp0, label %endif, label %if Index: test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll =================================================================== --- test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll +++ test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll @@ -4,7 +4,7 @@ ; allocate scratch registers correctly. Check that this test compiles without ; error. 
; TONGA-LABEL: test -define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) { entry: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) Index: test/CodeGen/AMDGPU/spill-cfg-position.ll =================================================================== --- test/CodeGen/AMDGPU/spill-cfg-position.ll +++ test/CodeGen/AMDGPU/spill-cfg-position.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: s_or_b64 exec ; CHECK: buffer_ -define void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) { +define amdgpu_kernel void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) { bb: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp14 = load i32, i32 addrspace(1)* %arg, align 4 Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -43,7 +43,7 @@ ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]] ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1 -define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 %cmp0 = icmp eq i32 %cond, 0 @@ -136,7 +136,7 @@ ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 ; GCN-NOT: s_buffer_load_dword m0 -define void @m0_unavailable_spill(i32 %m0.arg) #0 { +define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 { main_body: %m0 = call i32 asm sideeffect "; def $0, 1", "={M0}"() #0 %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg) @@ -189,7 +189,7 @@ ; TOSMEM: s_dcache_wb ; TOSMEM: s_endpgm -define void @restore_m0_lds(i32 %arg) { +define amdgpu_kernel void @restore_m0_lds(i32 %arg) { %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 %sval = load volatile i64, i64 addrspace(2)* undef %cmp = icmp eq i32 %arg, 0 Index: test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -11,7 +11,7 @@ ; Just test that it compiles successfully. 
; CHECK-LABEL: test -define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) Index: test/CodeGen/AMDGPU/split-scalar-i64-add.ll =================================================================== --- test/CodeGen/AMDGPU/split-scalar-i64-add.ll +++ test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -10,7 +10,7 @@ ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}} ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { %v.val = load volatile i32, i32 addrspace(1)* %in %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 @@ -23,7 +23,7 @@ ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0: ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f ; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0 -define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { +define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -35,7 +35,7 @@ ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %v.val = load volatile i32, i32 addrspace(1)* %in %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 @@ -48,7 +48,7 @@ ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1: ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}} -define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { +define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -61,7 +61,7 @@ ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2: ; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc -define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep Index: test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll =================================================================== --- test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll +++ test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll @@ -29,7 +29,7 @@ ; GCN-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 ; GCN: s_endpgm -define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { +define amdgpu_kernel void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { entry: %tmp = tail call i32 @llvm.r600.read.local.size.y() %tmp1 = tail call i32 @llvm.r600.read.local.size.z() Index: test/CodeGen/AMDGPU/splitkit.mir =================================================================== --- test/CodeGen/AMDGPU/splitkit.mir +++ test/CodeGen/AMDGPU/splitkit.mir @@ -1,7 +1,7 @@ # RUN: llc -o - %s -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s --- | - define void @func0() #0 { ret void } - define void @func1() #0 { ret void } + define amdgpu_kernel void @func0() #0 { ret void } + define amdgpu_kernel void @func1() #0 { ret void } attributes #0 = { "amdgpu-num-sgpr"="12" } ... Index: test/CodeGen/AMDGPU/sra.ll =================================================================== --- test/CodeGen/AMDGPU/sra.ll +++ test/CodeGen/AMDGPU/sra.ll @@ -13,7 +13,7 @@ ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -37,7 +37,7 @@ ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -51,7 +51,7 @@ ; global load we end up with the vector instructions rather than scalar. 
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr @@ -67,7 +67,7 @@ ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr @@ -80,7 +80,7 @@ ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 ; EG: ASHR -define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) { entry: %in.ext = sext i32 %in to i64 %ashr = ashr i64 %in.ext, 8 @@ -105,7 +105,7 @@ ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { entry: %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in @@ -143,7 +143,7 @@ ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -156,7 +156,7 @@ ; XFUNC-LABEL: {{^}}s_ashr_v2i64: ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}} ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}} -; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) { +; define amdgpu_kernel void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) { ; %result = ashr <2 x i64> %a, %b ; store <2 x i64> %result, <2 x i64> addrspace(1)* %out ; ret void @@ -221,7 +221,7 @@ ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -235,7 +235,7 @@ ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}} ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}} -define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result 
= ashr i64 %a, 32 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -247,7 +247,7 @@ ; VI: flat_load_dword v[[HI:[0-9]+]] ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}} -define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -262,7 +262,7 @@ ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} -define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = ashr i64 %a, 63 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -275,7 +275,7 @@ ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]] ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}} -define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/srem.ll =================================================================== --- test/CodeGen/AMDGPU/srem.ll +++ test/CodeGen/AMDGPU/srem.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s -define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in %den = load i32, i32 addrspace(1) * %den_ptr @@ -11,7 +11,7 @@ ret void } -define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = srem i32 %num, 4 store i32 %result, i32 addrspace(1)* %out @@ -24,14 +24,14 @@ ; SI: v_mul_lo_i32 ; SI: v_sub_i32 ; SI: s_endpgm -define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = srem i32 %num, 7 store i32 %result, i32 addrspace(1)* %out ret void } -define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr @@ -40,14 +40,14 @@ ret void } -define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %result = srem <2 x i32> %num, store <2 x i32> %result, <2 x i32> 
addrspace(1)* %out ret void } -define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr @@ -56,14 +56,14 @@ ret void } -define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = srem <4 x i32> %num, store <4 x i32> %result, <4 x i32> addrspace(1)* %out ret void } -define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %num = load i64, i64 addrspace(1) * %in %den = load i64, i64 addrspace(1) * %den_ptr @@ -72,14 +72,14 @@ ret void } -define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %num = load i64, i64 addrspace(1) * %in %result = srem i64 %num, 4 store i64 %result, i64 addrspace(1)* %out ret void } -define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %num = load <2 x i64>, <2 x i64> addrspace(1) * %in %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr @@ -88,14 +88,14 @@ ret void } -define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %num = load <2 x i64>, <2 x i64> addrspace(1) * %in %result = srem <2 x i64> %num, store <2 x i64> %result, <2 x i64> addrspace(1)* %out ret void } -define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %num = load <4 x i64>, <4 x i64> addrspace(1) * %in %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr @@ -104,7 +104,7 @@ ret void } -define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %num = load <4 x i64>, <4 x i64> addrspace(1) * %in %result = srem <4 x i64> %num, store <4 x i64> %result, <4 x i64> addrspace(1)* %out Index: test/CodeGen/AMDGPU/srl.ll =================================================================== --- test/CodeGen/AMDGPU/srl.ll +++ test/CodeGen/AMDGPU/srl.ll @@ -8,7 +8,7 @@ ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -26,7 +26,7 @@ ; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -50,7 +50,7 @@ ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -74,7 +74,7 @@ ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}} ; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]] ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 -define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -112,7 +112,7 @@ ; EG-DAG: CNDE_INT {{.*}}, 0.0 ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -178,7 +178,7 @@ ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -193,7 +193,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { %result = lshr i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void @@ -203,7 +203,7 @@ ; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}} -define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/ssubo.ll 
=================================================================== --- test/CodeGen/AMDGPU/ssubo.ll +++ test/CodeGen/AMDGPU/ssubo.ll @@ -6,7 +6,7 @@ declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone ; FUNC-LABEL: {{^}}ssubo_i64_zext: -define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -17,7 +17,7 @@ } ; FUNC-LABEL: {{^}}s_ssubo_i32: -define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %ssub, 0 %carry = extractvalue { i32, i1 } %ssub, 1 @@ -27,7 +27,7 @@ } ; FUNC-LABEL: {{^}}v_ssubo_i32: -define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -41,7 +41,7 @@ ; FUNC-LABEL: {{^}}s_ssubo_i64: ; SI: s_sub_u32 ; SI: s_subb_u32 -define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -53,7 +53,7 @@ ; FUNC-LABEL: {{^}}v_ssubo_i64: ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind Index: test/CodeGen/AMDGPU/store-barrier.ll =================================================================== --- test/CodeGen/AMDGPU/store-barrier.ll +++ test/CodeGen/AMDGPU/store-barrier.ll @@ -12,7 +12,7 @@ ; CHECK: s_barrier ; CHECK: s_endpgm ; Function Attrs: nounwind -define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 { +define amdgpu_kernel void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 { bb: %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 Index: test/CodeGen/AMDGPU/store-global.ll =================================================================== --- 
test/CodeGen/AMDGPU/store-global.ll +++ test/CodeGen/AMDGPU/store-global.ll @@ -11,7 +11,7 @@ ; CM-NOT: MEM_RAT MSKOR ; GCN: buffer_store_byte -define void @store_i1(i1 addrspace(1)* %out) { +define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) { entry: store i1 true, i1 addrspace(1)* %out ret void @@ -42,7 +42,7 @@ ; GCN: buffer_store_byte -define void @store_i8(i8 addrspace(1)* %out, i8 %in) { +define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) { entry: store i8 %in, i8 addrspace(1)* %out ret void @@ -75,7 +75,7 @@ ; EG: MOV * T[[RW_GPR]].Z, 0.0 ; GCN: buffer_store_short -define void @store_i16(i16 addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) { entry: store i16 %in, i16 addrspace(1)* %out ret void @@ -88,7 +88,7 @@ ; EG: MEM_RAT MSKOR ; EG: MEM_RAT MSKOR -define void @store_i24(i24 addrspace(1)* %out, i24 %in) { +define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) { entry: store i24 %in, i24 addrspace(1)* %out ret void @@ -104,7 +104,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD ; CM-NOT: MEM_RAT -define void @store_i25(i25 addrspace(1)* %out, i25 %in) { +define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) { entry: store i25 %in, i25 addrspace(1)* %out ret void @@ -119,7 +119,7 @@ ; CM-NOT: MEM_RAT MSKOR ; GCN: buffer_store_short -define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(1)* %out @@ -136,7 +136,7 @@ ; CM-NOT: MEM_RAT MSKOR ; SI: buffer_store_byte -define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1 @@ -150,7 +150,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dword -define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(1)* %out @@ -170,7 +170,7 @@ ; SI: buffer_store_short ; SI: buffer_store_short -define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2 @@ -183,7 +183,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dword -define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out @@ -210,7 +210,7 @@ ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1 @@ -231,7 +231,7 @@ ; SI: buffer_store_short ; SI: buffer_store_short ; SI-NOT: buffer_store_dword -define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void 
@store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2 @@ -246,7 +246,7 @@ ; GCN: buffer_store_dword -define void @store_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) { store float %in, float addrspace(1)* %out ret void } @@ -257,7 +257,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}} ; GCN: buffer_store_dwordx2 -define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> store <4 x i16> %0, <4 x i16> addrspace(1)* %out @@ -272,7 +272,7 @@ ; GCN: buffer_store_dwordx2 -define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { entry: %0 = insertelement <2 x float> , float %a, i32 0 %1 = insertelement <2 x float> %0, float %b, i32 1 @@ -286,7 +286,7 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}}, -define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { +define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 ret void } @@ -299,7 +299,7 @@ ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx4 -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out ret void @@ -313,7 +313,7 @@ ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; SI: buffer_store_dwordx4 -define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -328,7 +328,7 @@ ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx4 -define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %1 = load <4 x float>, <4 x float> addrspace(1) * %in store <4 x float> %1, <4 x float> addrspace(1)* %out ret void @@ -340,7 +340,7 @@ ; CM: MEM_RAT MSKOR ; GCN: buffer_store_byte -define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(1)* %out @@ -350,7 +350,7 @@ ; FUNC-LABEL: {{^}}store_i64_i16: ; EG: MEM_RAT MSKOR ; GCN: buffer_store_short -define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(1)* %out @@ -369,7 +369,7 @@ ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx2 -define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { entry: %0 = load i32, i32 addrspace(2)* %mem, align 4 %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* 
%mem, i64 1 @@ -388,7 +388,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X ; GCN: buffer_store_dwordx4 -define void @i128-const-store(i32 addrspace(1)* %out) { +define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) { entry: store i32 1, i32 addrspace(1)* %out, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 Index: test/CodeGen/AMDGPU/store-local.ll =================================================================== --- test/CodeGen/AMDGPU/store-local.ll +++ test/CodeGen/AMDGPU/store-local.ll @@ -9,7 +9,7 @@ ; CM: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i1(i1 addrspace(3)* %out) { +define amdgpu_kernel void @store_local_i1(i1 addrspace(3)* %out) { entry: store i1 true, i1 addrspace(3)* %out ret void @@ -21,7 +21,7 @@ ; CM: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { +define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { store i8 %in, i8 addrspace(3)* %out ret void } @@ -32,7 +32,7 @@ ; CM: LDS_SHORT_WRITE ; GCN: ds_write_b16 -define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { +define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { store i16 %in, i16 addrspace(3)* %out ret void } @@ -43,7 +43,7 @@ ; CM: LDS_WRITE ; GCN: ds_write_b32 -define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { +define amdgpu_kernel void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(3)* %out ret void @@ -55,7 +55,7 @@ ; CM: LDS_WRITE ; GCN: ds_write_b32 -define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out ret void @@ -78,7 +78,7 @@ ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: ds_write_b8 -define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1 ret void @@ -95,7 +95,7 @@ ; GCN: ds_write_b16 ; GCN: ds_write_b16 -define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2 ret void @@ -111,7 +111,7 @@ ; CM-NOT: LDS_WRITE ; GCN: ds_write_b64 -define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { entry: store <2 x i32> %in, <2 x i32> addrspace(3)* %out ret void @@ -129,7 +129,7 @@ ; CM: LDS_WRITE ; GCN: ds_write2_b64 -define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out ret void @@ -148,7 +148,7 @@ ; GCN: ds_write2_b32 ; GCN: ds_write2_b32 -define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4 ret void @@ -157,7 +157,7 @@ ; FUNC-LABEL: {{^}}store_local_i64_i8: ; EG: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void 
@store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { +define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(3)* %out @@ -167,7 +167,7 @@ ; FUNC-LABEL: {{^}}store_local_i64_i16: ; EG: LDS_SHORT_WRITE ; GCN: ds_write_b16 -define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { +define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(3)* %out Index: test/CodeGen/AMDGPU/store-private.ll =================================================================== --- test/CodeGen/AMDGPU/store-private.ll +++ test/CodeGen/AMDGPU/store-private.ll @@ -15,7 +15,7 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_i1(i1 addrspace(0)* %out) { +define amdgpu_kernel void @store_i1(i1 addrspace(0)* %out) { entry: store i1 true, i1 addrspace(0)* %out ret void @@ -44,7 +44,7 @@ ; SI: buffer_store_byte -define void @store_i8(i8 addrspace(0)* %out, i8 %in) { +define amdgpu_kernel void @store_i8(i8 addrspace(0)* %out, i8 %in) { entry: store i8 %in, i8 addrspace(0)* %out ret void @@ -72,7 +72,7 @@ ; EG: MOV * T(0 + AR.x).X+, [[RES]] ; SI: buffer_store_short -define void @store_i16(i16 addrspace(0)* %out, i16 %in) { +define amdgpu_kernel void @store_i16(i16 addrspace(0)* %out, i16 %in) { entry: store i16 %in, i16 addrspace(0)* %out ret void @@ -102,7 +102,7 @@ ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, -define void @store_i24(i24 addrspace(0)* %out, i24 %in) { +define amdgpu_kernel void @store_i24(i24 addrspace(0)* %out, i24 %in) { entry: store i24 %in, i24 addrspace(0)* %out ret void @@ -120,7 +120,7 @@ ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; CM-NOT: MOVA_INT -define void @store_i25(i25 addrspace(0)* %out, i25 %in) { +define amdgpu_kernel void @store_i25(i25 addrspace(0)* %out, i25 %in) { entry: store i25 %in, i25 addrspace(0)* %out ret void @@ -141,7 +141,7 @@ ; CM-NOT: MOVA_INT ; SI: buffer_store_short -define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(0)* %out @@ -172,7 +172,7 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1 @@ -191,7 +191,7 @@ ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(0)* %out @@ -223,7 +223,7 @@ ; SI: buffer_store_short ; SI: buffer_store_short -define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2 @@ -240,7 +240,7 @@ ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8(<4 x 
i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out @@ -299,7 +299,7 @@ ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1 @@ -410,7 +410,7 @@ ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { +define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { entry: %0 = trunc <8 x i32> %in to <8 x i8> store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1 @@ -443,7 +443,7 @@ ; SI: buffer_store_short ; SI: buffer_store_short ; SI-NOT: buffer_store_dword -define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2 @@ -460,7 +460,7 @@ ; SI: buffer_store_dword -define void @store_f32(float addrspace(0)* %out, float %in) { +define amdgpu_kernel void @store_f32(float addrspace(0)* %out, float %in) { store float %in, float addrspace(0)* %out ret void } @@ -480,7 +480,7 @@ ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> store <4 x i16> %0, <4 x i16> addrspace(0)* %out @@ -504,7 +504,7 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { +define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { entry: %0 = insertelement <2 x float> , float %a, i32 0 %1 = insertelement <2 x float> %0, float %b, i32 1 @@ -533,7 +533,7 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { +define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16 ret void } @@ -563,7 +563,7 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(0)* %out ret void @@ -594,7 +594,7 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4 ret void @@ -626,7 +626,7 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { +define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { %1 = load <4 x float>, <4 x float> addrspace(0) * %in store <4 x float> %1, <4 
x float> addrspace(0)* %out ret void @@ -644,7 +644,7 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(0)* %out @@ -663,7 +663,7 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_short -define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(0)* %out @@ -689,7 +689,7 @@ ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { entry: %0 = load i32, i32 addrspace(2)* %mem, align 4 %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 @@ -727,7 +727,7 @@ ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @i128-const-store(i32 addrspace(0)* %out) { +define amdgpu_kernel void @i128-const-store(i32 addrspace(0)* %out) { entry: store i32 1, i32 addrspace(0)* %out, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 Index: test/CodeGen/AMDGPU/store-v3i64.ll =================================================================== --- test/CodeGen/AMDGPU/store-v3i64.ll +++ test/CodeGen/AMDGPU/store-v3i64.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}global_store_v3i64: ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 ret void } @@ -40,7 +40,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 ret void } @@ -48,7 +48,7 @@ ; GCN-LABEL: {{^}}local_store_v3i64: ; GCN: ds_write2_b64 ; GCN: ds_write_b64 -define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { +define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 ret void } @@ -83,7 +83,7 @@ ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: ds_write_b8 -define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { +define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1 ret void } @@ -91,7 +91,7 @@ ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32: ; GCN-DAG: buffer_store_dwordx2 ; GCN-DAG: buffer_store_dword v -define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i32> store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out ret void @@ -100,7 +100,7 @@ ; GCN-LABEL: 
{{^}}global_truncstore_v3i64_to_v3i16: ; GCN-DAG: buffer_store_short ; GCN-DAG: buffer_store_dword v -define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i16> store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out ret void @@ -110,7 +110,7 @@ ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8: ; GCN-DAG: buffer_store_short ; GCN-DAG: buffer_store_byte v -define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i8> store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out ret void @@ -120,7 +120,7 @@ ; GCN-DAG: buffer_store_byte v ; GCN-DAG: buffer_store_byte v ; GCN-DAG: buffer_store_byte v -define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i1> store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/store-vector-ptrs.ll =================================================================== --- test/CodeGen/AMDGPU/store-vector-ptrs.ll +++ test/CodeGen/AMDGPU/store-vector-ptrs.ll @@ -5,7 +5,7 @@ ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting ; scratch loads and stores. ; CHECK-LABEL: {{^}}store_vector_ptrs: -define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { +define amdgpu_kernel void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> store <4 x i32*> %p, <4 x i32*>* %out ret void Index: test/CodeGen/AMDGPU/store_typed.ll =================================================================== --- test/CodeGen/AMDGPU/store_typed.ll +++ test/CodeGen/AMDGPU/store_typed.ll @@ -6,7 +6,7 @@ ; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1 ; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}} -define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { +define amdgpu_kernel void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0) ret void } @@ -16,7 +16,7 @@ ; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1 ; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}} -define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { +define amdgpu_kernel void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11) ret void } Index: test/CodeGen/AMDGPU/structurize.ll =================================================================== --- test/CodeGen/AMDGPU/structurize.ll +++ test/CodeGen/AMDGPU/structurize.ll @@ -45,7 +45,7 @@ ; CHECK: CF_END -define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = icmp ne i32 %a, 0 br i1 %0, label %diamond_head, label %branch_from Index: test/CodeGen/AMDGPU/structurize1.ll =================================================================== --- test/CodeGen/AMDGPU/structurize1.ll +++ test/CodeGen/AMDGPU/structurize1.ll @@ -19,7 
+19,7 @@ ; CHECK-LABEL: {{^}}if_inside_loop: ; CHECK: LOOP_START_DX10 ; CHECK: END_LOOP -define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: br label %for.body Index: test/CodeGen/AMDGPU/sub.i16.ll =================================================================== --- test/CodeGen/AMDGPU/sub.i16.ll +++ test/CodeGen/AMDGPU/sub.i16.ll @@ -7,7 +7,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -24,7 +24,7 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -39,7 +39,7 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -54,7 +54,7 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -70,7 +70,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_dword [[ADD]] -define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -90,7 +90,7 @@ ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { 
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -110,7 +110,7 @@ ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] -define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -131,7 +131,7 @@ ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -149,7 +149,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute: ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}} ; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}} -define void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %size = call i32 @llvm.amdgcn.groupstaticsize() %size.trunc = trunc i32 %size to i16 call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds) Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -8,7 +8,7 @@ ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -25,7 +25,7 @@ ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -45,7 +45,7 @@ ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = 
getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -55,7 +55,7 @@ } ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr @@ -69,7 +69,7 @@ ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 %a = load <2 x i16>, <2 x i16> addrspace(1) * %in %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr @@ -85,7 +85,7 @@ ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 %a = load <4 x i16>, <4 x i16> addrspace(1) * %in %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr @@ -103,7 +103,7 @@ ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT {{[* ]*}} -define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { %result = sub i64 %a, %b store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -118,7 +118,7 @@ ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT {{[* ]*}} -define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { +define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid @@ -134,7 +134,7 @@ ; SI: v_subb_u32_e32 ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid @@ -154,7 +154,7 @@ ; SI: v_subb_u32_e32 ; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid Index: 
test/CodeGen/AMDGPU/sub.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sub.v2i16.ll +++ test/CodeGen/AMDGPU/sub.v2i16.ll @@ -7,7 +7,7 @@ ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -27,7 +27,7 @@ ; VI: s_sub_i32 ; VI: s_sub_i32 -define void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { +define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 %add = sub <2 x i16> %a, %b @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}s_test_sub_self_v2i16: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]] ; GCN: buffer_store_dword [[ZERO]] -define void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { +define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 %add = sub <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -51,7 +51,7 @@ ; VI: v_subrev_i32_e32 ; VI: v_subrev_i32_e32 -define void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { +define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -63,7 +63,7 @@ ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}} -define void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -80,7 +80,7 @@ ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}} -define void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -99,7 +99,7 @@ ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]] ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> 
addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -118,7 +118,7 @@ ; VI-NOT: v_subrev_i16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -138,7 +138,7 @@ ; VI-NOT: v_subrev_i16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -170,7 +170,7 @@ ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} -define void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -205,7 +205,7 @@ ; VI-DAG: v_subrev_u16_e32 ; VI: buffer_store_dwordx4 -define void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -231,7 +231,7 @@ ; VI: v_subrev_u16_e32 ; VI: v_subrev_u16_e32 ; VI: buffer_store_dwordx2 -define void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -259,7 +259,7 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -define void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void 
@v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid Index: test/CodeGen/AMDGPU/subreg-coalescer-crash.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL:{{^}}row_filter_C1_D0: -define void @row_filter_C1_D0() #0 { +define amdgpu_kernel void @row_filter_C1_D0() #0 { entry: br i1 undef, label %for.inc.1, label %do.body.preheader Index: test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: s_mov_b32 s6, -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; CHECK-NEXT: s_endpgm -define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { entry: %v0 = insertelement <4 x float> undef, float %a0, i32 0 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 Index: test/CodeGen/AMDGPU/subreg-eliminate-dead.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-eliminate-dead.ll +++ test/CodeGen/AMDGPU/subreg-eliminate-dead.ll @@ -5,7 +5,7 @@ ; Just make sure this test doesn't crash. ; CHECK-LABEL: foobar: ; CHECK: s_endpgm -define void @foobar() { +define amdgpu_kernel void @foobar() { %v0 = icmp eq <4 x i32> undef, %v3 = sext <4 x i1> %v0 to <4 x i32> %v4 = extractelement <4 x i32> %v3, i32 1 Index: test/CodeGen/AMDGPU/subreg-intervals.mir =================================================================== --- test/CodeGen/AMDGPU/subreg-intervals.mir +++ test/CodeGen/AMDGPU/subreg-intervals.mir @@ -10,8 +10,8 @@ # CHECK-LABEL: Machine code for function test1: --- | - define void @test0() { ret void } - define void @test1() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } ... 
--- name: test0 Index: test/CodeGen/AMDGPU/target-cpu.ll =================================================================== --- test/CodeGen/AMDGPU/target-cpu.ll +++ test/CodeGen/AMDGPU/target-cpu.ll @@ -14,7 +14,7 @@ ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @target_none() #0 { +define amdgpu_kernel void @target_none() #0 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -30,7 +30,7 @@ ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @target_tahiti() #1 { +define amdgpu_kernel void @target_tahiti() #1 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -46,7 +46,7 @@ ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 ; CHECK: s_dcache_inv_vol -define void @target_bonaire() #3 { +define amdgpu_kernel void @target_bonaire() #3 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -63,7 +63,7 @@ ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400 ; CHECK: flat_store_dword ; CHECK: s_dcache_wb{{$}} -define void @target_fiji() #4 { +define amdgpu_kernel void @target_fiji() #4 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -79,7 +79,7 @@ ; CHECK-LABEL: {{^}}promote_alloca_enabled: ; CHECK: ds_read_b32 ; CHECK: ; LDSByteSize: 5120 -define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -93,7 +93,7 @@ ; CHECK: SCRATCH_RSRC_DWORD0 ; CHECK: SCRATCH_RSRC_DWORD1 ; CHECK: ScratchSize: 24 -define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/trap.ll =================================================================== --- test/CodeGen/AMDGPU/trap.ll +++ test/CodeGen/AMDGPU/trap.ll @@ -38,7 +38,7 @@ ; TRAP-BIT: enable_trap_handler = 1 ; NO-TRAP-BIT: enable_trap_handler = 0 ; NO-MESA-TRAP: s_endpgm -define void @hsa_trap() { 
+define amdgpu_kernel void @hsa_trap() { call void @llvm.trap() ret void } @@ -64,7 +64,7 @@ ; TRAP-BIT: enable_trap_handler = 1 ; NO-TRAP-BIT: enable_trap_handler = 0 ; NO-MESA-TRAP: s_endpgm -define void @hsa_debugtrap() { +define amdgpu_kernel void @hsa_debugtrap() { call void @llvm.debugtrap() ret void } @@ -75,7 +75,7 @@ ; NO-TRAP-BIT: enable_trap_handler = 0 ; NO-HSA-TRAP: s_endpgm ; NO-MESA-TRAP: s_endpgm -define void @trap() { +define amdgpu_kernel void @trap() { call void @llvm.trap() ret void } Index: test/CodeGen/AMDGPU/trunc-bitcast-vector.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %bc = bitcast <2 x i32> %ld to i64 %trunc = trunc i64 %bc to i32 @@ -15,7 +15,7 @@ ; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) { %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in %bc = bitcast <3 x i32> %ld to i96 %trunc = trunc i96 %bc to i32 @@ -26,7 +26,7 @@ ; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %bc = bitcast <4 x i32> %ld to i128 %trunc = trunc i128 %bc to i32 @@ -38,7 +38,7 @@ ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_short [[VAL]] -define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in %bc = bitcast <2 x i16> %ld to i32 %trunc = trunc i32 %bc to i16 @@ -54,7 +54,7 @@ ; SI: buffer_load_dword v[[VAL:[0-9]+]] ; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]] ; CHECK: buffer_store_short [[VAL]] -define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in %bc = bitcast <4 x i16> %ld to i64 %trunc = trunc i64 %bc to i16 @@ -66,7 +66,7 @@ ; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8: ; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in %bc = bitcast <2 x i8> %ld to i16 %trunc = trunc i16 %bc to i8 @@ -77,7 +77,7 @@ ; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void 
@trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in %bc = bitcast <4 x i8> %ld to i32 %trunc = trunc i32 %bc to i8 @@ -88,7 +88,7 @@ ; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) { %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %bc = bitcast <3 x i8> %ld to i24 %trunc = trunc i24 %bc to i8 Index: test/CodeGen/AMDGPU/trunc-cmp-constant.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-cmp-constant.ll +++ test/CodeGen/AMDGPU/trunc-cmp-constant.ll @@ -9,7 +9,7 @@ ; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}} ; SI: v_cndmask_b32_e64 ; SI: buffer_store_byte -define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, 0 @@ -25,7 +25,7 @@ ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, 0 @@ -36,7 +36,7 @@ ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, 1 @@ -48,7 +48,7 @@ ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, 1 @@ -60,7 +60,7 @@ ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, -1 @@ -71,7 +71,7 @@ ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void 
@zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, -1 @@ -84,7 +84,7 @@ ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, 0 @@ -96,7 +96,7 @@ ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, 0 @@ -107,7 +107,7 @@ ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, 1 @@ -122,7 +122,7 @@ ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, 1 @@ -137,7 +137,7 @@ ; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] ; XSI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, -1 @@ -148,7 +148,7 @@ ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, -1 @@ -162,7 +162,7 @@ ; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}} ; SI-NEXT: v_cndmask_b32_e64 ; SI: {{buffer|flat}}_store_byte -define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x %load = load i8, i8 
addrspace(1)* %in.ptr Index: test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll +++ test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: ; GCN: s_endpgm -define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { %val = load double, double addrspace(1)* %in %cvt = fptrunc double %val to half store half %cvt, half addrspace(1)* %out @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: ; GCN: s_endpgm -define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { %val = load <2 x double>, <2 x double> addrspace(1)* %in %cvt = fptrunc <2 x double> %val to <2 x half> store <2 x half> %cvt, <2 x half> addrspace(1)* %out @@ -20,7 +20,7 @@ ; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: ; GCN: s_endpgm -define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { %val = load <3 x double>, <3 x double> addrspace(1)* %in %cvt = fptrunc <3 x double> %val to <3 x half> store <3 x half> %cvt, <3 x half> addrspace(1)* %out @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: ; GCN: s_endpgm -define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { %val = load <4 x double>, <4 x double> addrspace(1)* %in %cvt = fptrunc <4 x double> %val to <4 x half> store <4 x half> %cvt, <4 x half> addrspace(1)* %out @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: ; GCN: s_endpgm -define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { %val = load <8 x double>, <8 x double> addrspace(1)* %in %cvt = fptrunc <8 x double> %val to <8 x half> store <8 x half> %cvt, <8 x half> addrspace(1)* %out @@ -47,7 +47,7 @@ ; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: ; GCN: s_endpgm -define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { %val = load <16 x double>, <16 x double> addrspace(1)* %in %cvt = fptrunc <16 x double> %val to <16 x half> store <16 x half> %cvt, <16 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/trunc-store-i1.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store-i1.ll +++ test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -7,7 +7,7 @@ ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { +define amdgpu_kernel 
void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { %trunc = trunc i32 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void @@ -15,7 +15,7 @@ ; SI-LABEL: {{^}}global_truncstore_i64_to_i1: ; SI: buffer_store_byte -define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { +define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { %trunc = trunc i64 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void @@ -26,13 +26,13 @@ ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { +define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { %trunc = trunc i16 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } ; SI-LABEL: {{^}}global_truncstore_i16_to_i1: -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { +define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { %add = add i16 %val0, %val1 %trunc = trunc i16 %add to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 Index: test/CodeGen/AMDGPU/trunc-store.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store.ll +++ test/CodeGen/AMDGPU/trunc-store.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8: ; SI: buffer_store_dwordx4 -define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) { +define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) { %trunc = trunc <16 x i32> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out ret void @@ -11,7 +11,7 @@ ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8: ; SI: buffer_store_dwordx4 -define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) { +define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) { %trunc = trunc <16 x i64> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll +++ test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: {{^}}test: ; CHECK: MEM_RAT_CACHELESS STORE_RAW -define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { +define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %if, label %done Index: test/CodeGen/AMDGPU/trunc.ll =================================================================== --- test/CodeGen/AMDGPU/trunc.ll +++ test/CodeGen/AMDGPU/trunc.ll @@ -4,7 +4,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone -define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] @@ -28,7 +28,7 @@ ; SI: buffer_store_dword [[VSHL]] ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]] -define void 
@trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { %b = shl i64 %a, 2 %result = trunc i64 %b to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -46,7 +46,7 @@ ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]] ; GCN: v_mov_b32_e32 ; GCN: v_mov_b32_e32 -define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { %aa = add i64 %a, 234 ; Prevent shrinking store. %b = shl i64 %aa, 2 %result = trunc i64 %b to i32 @@ -57,7 +57,7 @@ ; GCN-LABEL: {{^}}trunc_i32_to_i1: ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { +define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { %a = load i32, i32 addrspace(1)* %ptr, align 4 %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 @@ -67,7 +67,7 @@ ; GCN-LABEL: {{^}}trunc_i8_to_i1: ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} -define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { +define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { %a = load i8, i8 addrspace(1)* %ptr, align 4 %trunc = trunc i8 %a to i1 %result = select i1 %trunc, i8 1, i8 0 @@ -77,7 +77,7 @@ ; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1: ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 -define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { +define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { %trunc = trunc i16 %a to i1 %result = select i1 %trunc, i16 1, i16 0 store i16 %result, i16 addrspace(1)* %out, align 4 @@ -86,7 +86,7 @@ ; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1: ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 -define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -99,7 +99,7 @@ ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}} ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] -define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { +define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 store i32 %sel, i32 addrspace(1)* %out @@ -112,7 +112,7 @@ ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]] ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/tti-unroll-prefs.ll =================================================================== --- test/CodeGen/AMDGPU/tti-unroll-prefs.ll +++ test/CodeGen/AMDGPU/tti-unroll-prefs.ll @@ -19,7 +19,7 @@ ; CHECK: store i8 0, i8 addrspace(1)* ; CHECK-NOT: store i8 0, i8 addrspace(1)* ; CHECK: ret void -define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 
%c) { +define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { entry: %add = add nsw i32 %b, 4 %cmp = icmp sgt i32 %add, %a Index: test/CodeGen/AMDGPU/uaddo.ll =================================================================== --- test/CodeGen/AMDGPU/uaddo.ll +++ test/CodeGen/AMDGPU/uaddo.ll @@ -9,7 +9,7 @@ ; EG: ADDC_UINT ; EG: ADDC_UINT -define void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -27,7 +27,7 @@ ; EG: ADDC_UINT ; EG: ADD_INT -define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 @@ -42,7 +42,7 @@ ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -63,7 +63,7 @@ ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -85,7 +85,7 @@ ; EG: ADDC_UINT ; EG: ADD_INT -define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -100,7 +100,7 @@ ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr @@ -118,7 +118,7 @@ ; FUNC-LABEL: {{^}}v_uaddo_i16: ; VI: v_add_u16_e32 ; VI: v_cmp_lt_u16_e32 -define void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr Index: test/CodeGen/AMDGPU/udiv.ll 
=================================================================== --- test/CodeGen/AMDGPU/udiv.ll +++ test/CodeGen/AMDGPU/udiv.ll @@ -10,7 +10,7 @@ ; EG: CF_END ; SI: v_rcp_iflag_f32_e32 -define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -21,7 +21,7 @@ ; FUNC-LABEL: {{^}}s_udiv_i32: ; SI: v_rcp_iflag_f32_e32 -define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = udiv i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ ; SI: v_rcp_iflag_f32_e32 ; SI: v_rcp_iflag_f32_e32 ; SI: s_endpgm -define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -50,7 +50,7 @@ ; FUNC-LABEL: {{^}}udiv_v4i32: ; EG: CF_END ; SI: s_endpgm -define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -63,7 +63,7 @@ ; SI: buffer_load_dword [[VAL:v[0-9]+]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 16 @@ -77,7 +77,7 @@ ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 34259182 @@ -91,7 +91,7 @@ ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 34259183 @@ -103,7 +103,7 @@ ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -117,7 +117,7 @@ ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, 
v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in %den = load i16, i16 addrspace(1) * %den_ptr @@ -131,7 +131,7 @@ ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1 %num = load i23, i23 addrspace(1) * %in %den = load i23, i23 addrspace(1) * %den_ptr @@ -143,7 +143,7 @@ ; FUNC-LABEL: {{^}}v_udiv_i24: ; SI-NOT: v_rcp_f32 -define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 %num = load i24, i24 addrspace(1) * %in %den = load i24, i24 addrspace(1) * %den_ptr @@ -159,7 +159,7 @@ ; SI: v_mul_hi_u32 ; SI: v_mul_hi_u32 -define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { +define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %2 = udiv <4 x i32> %1, store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 @@ -168,7 +168,7 @@ ; FUNC-LABEL: {{^}}test_udiv2: ; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 -define void @test_udiv2(i32 %p) { +define amdgpu_kernel void @test_udiv2(i32 %p) { %i = udiv i32 %p, 2 store volatile i32 %i, i32 addrspace(1)* undef ret void @@ -178,7 +178,7 @@ ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab ; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -define void @test_udiv_3_mulhu(i32 %p) { +define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { %i = udiv i32 %p, 3 store volatile i32 %i, i32 addrspace(1)* undef ret void Index: test/CodeGen/AMDGPU/udivrem.ll =================================================================== --- test/CodeGen/AMDGPU/udivrem.ll +++ test/CodeGen/AMDGPU/udivrem.ll @@ -51,7 +51,7 @@ ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { +define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { %result0 = udiv i32 %x, %y store i32 %result0, i32 addrspace(1)* %out0 %result1 = urem i32 %x, %y @@ -158,7 +158,7 @@ ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { %result0 = udiv <2 x i32> %x, %y store <2 x i32> %result0, <2 x i32> addrspace(1)* %out %result1 = urem <2 x i32> %x, %y @@ -340,7 +340,7 @@ ; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { %result0 = udiv <4 x i32> %x, %y store <4 x i32> %result0, <4 x i32> addrspace(1)* 
%out %result1 = urem <4 x i32> %x, %y Index: test/CodeGen/AMDGPU/udivrem24.ll =================================================================== --- test/CodeGen/AMDGPU/udivrem24.ll +++ test/CodeGen/AMDGPU/udivrem24.ll @@ -12,7 +12,7 @@ ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -31,7 +31,7 @@ ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -50,7 +50,7 @@ ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -67,7 +67,7 @@ ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -84,7 +84,7 @@ ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -101,7 +101,7 @@ ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -121,7 +121,7 @@ ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -141,7 +141,7 @@ ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -161,7 +161,7 @@ ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_2(i32 addrspace(1)* 
%out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -184,7 +184,7 @@ ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -203,7 +203,7 @@ ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -215,7 +215,7 @@ ; FUNC-LABEL: {{^}}urem24_i32: ; SI-NOT: v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -235,7 +235,7 @@ ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -255,7 +255,7 @@ ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -275,7 +275,7 @@ ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -294,7 +294,7 @@ ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], ; EG: RECIP_IEEE -define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -313,7 +313,7 @@ ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], ; EG: RECIP_IEEE -define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 Index: 
test/CodeGen/AMDGPU/udivrem64.ll =================================================================== --- test/CodeGen/AMDGPU/udivrem64.ll +++ test/CodeGen/AMDGPU/udivrem64.ll @@ -70,7 +70,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -144,7 +144,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -159,7 +159,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = udiv i64 %1, %2 @@ -176,7 +176,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = urem i64 %1, %2 @@ -195,7 +195,7 @@ ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = udiv i64 %1, %2 @@ -214,7 +214,7 @@ ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = urem i64 %1, %2 Index: test/CodeGen/AMDGPU/uint_to_fp.f64.ll =================================================================== --- test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -9,7 +9,7 @@ ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %gep, align 8 @@ -19,21 +19,21 @@ } ; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 -define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { %cast = uitofp i64 %in to double store double %cast, double addrspace(1)* %out, align 8 ret void } ; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 -define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { %cast = uitofp <2 x i64> %in to <2 x double> store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 ret void } ; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 -define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { 
+define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { %cast = uitofp <4 x i64> %in to <4 x double> store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 ret void @@ -42,7 +42,7 @@ ; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { %cast = uitofp i32 %in to double store double %cast, double addrspace(1)* %out, align 8 ret void @@ -52,7 +52,7 @@ ; SI: v_cvt_f64_u32_e32 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { %cast = uitofp <2 x i32> %in to <2 x double> store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 ret void @@ -64,7 +64,7 @@ ; SI: v_cvt_f64_u32_e32 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { %cast = uitofp <4 x i32> %in to <4 x double> store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 ret void @@ -79,7 +79,7 @@ ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}} ; SI: s_endpgm -define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to double store double %fp, double addrspace(1)* %out, align 4 @@ -91,7 +91,7 @@ ; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { +define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { %fp = uitofp i1 %in to double store double %fp, double addrspace(1)* %out, align 8 ret void Index: test/CodeGen/AMDGPU/uint_to_fp.i64.ll =================================================================== --- test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -4,7 +4,7 @@ ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f16: -define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { %result = uitofp i64 %in to half store half %result, half addrspace(1)* %out ret void @@ -24,7 +24,7 @@ ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]] ; GCN: {{buffer|flat}}_store_short {{.*}}[[VR_F16]] -define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -35,7 +35,7 @@ } ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32: -define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) 
#0 { %result = uitofp i64 %in to float store float %result, float addrspace(1)* %out ret void @@ -54,7 +54,7 @@ ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]] -define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -65,14 +65,14 @@ } ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f32: -define void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ %result = uitofp <2 x i64> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f32: -define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -83,14 +83,14 @@ } ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f16: -define void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ %result = uitofp <2 x i64> %in to <2 x half> store <2 x half> %result, <2 x half> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f16: -define void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/uint_to_fp.ll =================================================================== --- test/CodeGen/AMDGPU/uint_to_fp.ll +++ test/CodeGen/AMDGPU/uint_to_fp.ll @@ -6,7 +6,7 @@ ; SI: v_cvt_f32_u32_e32 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { %result = uitofp i32 %in to float store float %result, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ ; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}} ; R600: INT_TO_FLT -define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -32,7 +32,7 @@ ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X -define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> 
addrspace(1)* %out, <2 x i32> %in) #0 { %result = uitofp <2 x i32> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -49,7 +49,7 @@ ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %value = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = uitofp <4 x i32> %value to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -81,7 +81,7 @@ ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to float store float %fp, float addrspace(1)* %out @@ -92,7 +92,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { %fp = uitofp i1 %in to float store float %fp, float addrspace(1)* %out ret void @@ -105,7 +105,7 @@ ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]] ; SI: s_endpgm -define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -126,7 +126,7 @@ ; R600-DAG: SETGT_UINT ; R600-DAG: SETE_INT -define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { entry: %cvt = uitofp i64 %in to float store float %cvt, float addrspace(1)* %out Index: test/CodeGen/AMDGPU/uitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/uitofp.f16.ll +++ test/CodeGen/AMDGPU/uitofp.f16.ll @@ -8,7 +8,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @uitofp_i16_to_f16( +define amdgpu_kernel void @uitofp_i16_to_f16( half addrspace(1)* %r, i16 addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @uitofp_i32_to_f16( +define 
amdgpu_kernel void @uitofp_i32_to_f16( half addrspace(1)* %r, i32 addrspace(1)* %a) { entry: @@ -48,7 +48,7 @@ ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @uitofp_v2i16_to_v2f16( +define amdgpu_kernel void @uitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { entry: @@ -68,7 +68,7 @@ ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @uitofp_v2i32_to_v2f16( +define amdgpu_kernel void @uitofp_v2i32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) { entry: Index: test/CodeGen/AMDGPU/umed3.ll =================================================================== --- test/CodeGen/AMDGPU/umed3.ll +++ test/CodeGen/AMDGPU/umed3.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32: ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -25,7 +25,7 @@ ; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32: ; GCN: v_max_u32 ; GCN: v_min_u32 -define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -45,7 +45,7 @@ ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32: ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -64,7 +64,7 @@ ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -83,7 +83,7 @@ ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64: ; GCN: v_cmp_lt_u64 ; GCN: v_cmp_gt_u64 -define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -102,7 +102,7 @@ ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16: ; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) 
#1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid @@ -173,7 +173,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -185,7 +185,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -197,7 +197,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -209,7 +209,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -221,7 +221,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -233,7 +233,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -245,7 +245,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -257,7 +257,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -269,7 +269,7 @@ 
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -281,7 +281,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -293,7 +293,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -305,7 +305,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -317,7 +317,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -329,7 +329,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -341,7 +341,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -353,7 +353,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -368,7 +368,7 @@ ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 
%y, i16 %z) #1 { bb: %tmp0 = call i16 @umin16(i16 %x, i16 %y) %tmp1 = call i16 @umax16(i16 %x, i16 %y) @@ -383,7 +383,7 @@ ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { bb: %tmp0 = call i8 @umin8(i8 %x, i8 %y) %tmp1 = call i8 @umax8(i8 %x, i8 %y) @@ -395,7 +395,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -408,7 +408,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -421,7 +421,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -434,7 +434,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -447,7 +447,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 1, i32 %y) %tmp1 = call i32 @umax(i32 1, i32 %y) @@ -459,7 +459,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 2) %tmp1 = call i32 @umax(i32 %x, i32 2) @@ -471,7 +471,7 @@ ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9 -define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -491,7 +491,7 @@ ; VI: v_max_u16 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_umed3_i16_pat_0(i16 
addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid Index: test/CodeGen/AMDGPU/unaligned-load-store.ll =================================================================== --- test/CodeGen/AMDGPU/unaligned-load-store.ll +++ test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -8,7 +8,7 @@ ; SI: ds_write_b8 ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 { %v = load i16, i16 addrspace(3)* %p, align 1 store i16 %v, i16 addrspace(3)* %r, align 1 ret void @@ -23,7 +23,7 @@ ; UNALIGNED: buffer_load_ushort ; UNALIGNED: buffer_store_short ; SI: s_endpgm -define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 { %v = load i16, i16 addrspace(1)* %p, align 1 store i16 %v, i16 addrspace(1)* %r, align 1 ret void @@ -42,7 +42,7 @@ ; SI: ds_write_b8 ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { %v = load i32, i32 addrspace(3)* %p, align 1 store i32 %v, i32 addrspace(3)* %r, align 1 ret void @@ -60,7 +60,7 @@ ; UNALIGNED: buffer_load_dword ; UNALIGNED: buffer_store_dword -define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(1)* %p, align 1 store i32 %v, i32 addrspace(1)* %r, align 1 ret void @@ -74,7 +74,7 @@ ; UNALIGNED: buffer_load_dword ; UNALIGNED: buffer_store_dword -define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(1)* %p, align 2 store i32 %v, i32 addrspace(1)* %r, align 2 ret void @@ -85,7 +85,7 @@ ; GCN: ds_read_u16 ; GCN: ds_write_b16 ; GCN: ds_write_b16 -define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { %v = load i32, i32 addrspace(3)* %p, align 2 store i32 %v, i32 addrspace(3)* %r, align 2 ret void @@ -132,7 +132,7 @@ ; SI-NOT: v_lshl ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 { %v = load i64, i64 addrspace(3)* %p, align 1 store i64 %v, i64 addrspace(3)* %r, align 1 ret void @@ -179,7 +179,7 @@ ; SI-NOT: v_lshl ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 { %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1 store <2 x i32> %v, 
<2 x i32> addrspace(3)* %r, align 1 ret void @@ -209,7 +209,7 @@ ; UNALIGNED: buffer_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(1)* %p, align 2 store i64 %v, i64 addrspace(1)* %r, align 2 ret void @@ -239,7 +239,7 @@ ; UNALIGNED: buffer_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(1)* %p, align 1 store i64 %v, i64 addrspace(1)* %r, align 1 ret void @@ -286,7 +286,7 @@ ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: s_endpgm -define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 ret void @@ -329,7 +329,7 @@ ; UNALIGNED: buffer_load_dwordx4 ; UNALIGNED: buffer_store_dwordx4 -define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 ret void @@ -337,7 +337,7 @@ ; FUNC-LABEL: {{^}}local_load_i64_align_4: ; GCN: ds_read2_b32 -define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %val = load i64, i64 addrspace(3)* %in, align 4 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -345,7 +345,7 @@ ; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 -define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4 %val = load i64, i64 addrspace(3)* %ptr, align 4 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -356,7 +356,7 @@ ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 ; GCN: s_endpgm -define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* @@ -375,7 +375,7 @@ ; GCN: ds_read_u8 ; GCN: ds_read_u8 ; GCN: store_dwordx2 -define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %val = load i64, i64 
addrspace(3)* %in, align 1 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -383,7 +383,7 @@ ; FUNC-LABEL: {{^}}local_store_i64_align_4: ; GCN: ds_write2_b32 -define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { +define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { store i64 %val, i64 addrspace(3)* %out, align 4 ret void } @@ -391,7 +391,7 @@ ; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 ; GCN: s_endpgm -define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 store i64 0, i64 addrspace(3)* %ptr, align 4 ret void @@ -401,7 +401,7 @@ ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; GCN: s_endpgm -define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* @@ -418,7 +418,7 @@ ; UNALIGNED: s_load_dword ; SI: buffer_store_dword -define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(2)* %p, align 1 store i32 %v, i32 addrspace(1)* %r, align 4 ret void @@ -430,7 +430,7 @@ ; UNALIGNED: s_load_dword ; UNALIGNED: buffer_store_dword -define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(2)* %p, align 2 store i32 %v, i32 addrspace(1)* %r, align 4 ret void @@ -444,7 +444,7 @@ ; UNALIGNED: s_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(2)* %p, align 2 store i64 %v, i64 addrspace(1)* %r, align 4 ret void @@ -453,7 +453,7 @@ ; SI-LABEL: {{^}}constant_align4_load_i64: ; SI: s_load_dwordx2 ; SI: buffer_store_dwordx2 -define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(2)* %p, align 4 store i64 %v, i64 addrspace(1)* %r, align 4 ret void @@ -462,7 +462,7 @@ ; SI-LABEL: {{^}}constant_align4_load_v4i32: ; SI: s_load_dwordx4 ; SI: buffer_store_dwordx4 -define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void @@ -482,7 +482,7 @@ ; UNALIGNED: buffer_load_dwordx2 ; SI: buffer_store_dwordx2 -define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 { +define 
amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 { %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1 store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4 ret void @@ -512,7 +512,7 @@ ; UNALIGNED: buffer_load_dwordx4 ; SI: buffer_store_dwordx4 -define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void @@ -521,7 +521,7 @@ ; SI-LABEL: {{^}}constant_align4_load_i8: ; SI: buffer_load_ubyte ; SI: buffer_store_byte -define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { %v = load i8, i8 addrspace(2)* %p, align 4 store i8 %v, i8 addrspace(1)* %r, align 4 ret void @@ -530,7 +530,7 @@ ; SI-LABEL: {{^}}constant_align2_load_i8: ; SI: buffer_load_ubyte ; SI: buffer_store_byte -define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { %v = load i8, i8 addrspace(2)* %p, align 2 store i8 %v, i8 addrspace(1)* %r, align 2 ret void @@ -541,7 +541,7 @@ ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1 %v0 = load i32, i32 addrspace(2)* %p, align 4 %v1 = load i32, i32 addrspace(2)* %gep0, align 4 @@ -571,7 +571,7 @@ ; SI: ds_read_u8 ; SI: ScratchSize: 0{{$}} -define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 { %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1 store <16 x i8> %ld, <16 x i8> addrspace(1)* %out ret void @@ -596,7 +596,7 @@ ; SI: ds_write_b8 ; SI: ScratchSize: 0{{$}} -define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 { store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1 ret void } Index: test/CodeGen/AMDGPU/undefined-subreg-liverange.ll =================================================================== --- test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}func: -define void @func() #0 { +define amdgpu_kernel void @func() #0 { B0: br i1 undef, label %B1, label %B2 @@ -72,7 +72,7 @@ ; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}} -define void @partially_undef_copy() #0 { +define amdgpu_kernel void @partially_undef_copy() #0 { %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"() %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"() Index: test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll =================================================================== --- 
test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll +++ test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll @@ -5,7 +5,7 @@ ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. ; COMMON-LABEL: {{^}}branch_true: -define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 true, label %for.end, label %for.body.lr.ph @@ -42,7 +42,7 @@ ; SI: s_cbranch_vccnz ; SI: s_cbranch_scc1 ; SI: s_endpgm -define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 false, label %for.end, label %for.body.lr.ph @@ -79,7 +79,7 @@ ; SI: s_cbranch_scc1 ; SI: s_cbranch_scc1 ; SI: s_endpgm -define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 undef, label %for.end, label %for.body.lr.ph Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -12,7 +12,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %else @@ -40,7 +40,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, label %if, label %else @@ -68,7 +68,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %else, label %if @@ -96,7 +96,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, label %else, label %if @@ -123,7 +123,7 @@ ; GCN: buffer_store_dword ; GCN: [[ENDIF_LABEL]]: ; GCN: s_endpgm -define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { +define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -148,7 +148,7 @@ ; GCN: buffer_store_dword ; GCN: [[ENDIF_LABEL]]: ; GCN: s_endpgm -define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { +define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -176,7 +176,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] ; GCN: s_endpgm -define void @uniform_if_else_ret(i32 addrspace(1)* 
nocapture %out, i32 %a) { +define amdgpu_kernel void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) { entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -209,7 +209,7 @@ ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 ; GCN: buffer_store_dword [[THREE]] ; GCN: s_endpgm -define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) { +define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) { entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -233,7 +233,7 @@ ; GCN: buffer_store_dword ; GCN: [[LABEL]]: ; GCN: s_endpgm -define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 %1 = sext i1 %0 to i32 @@ -258,7 +258,7 @@ ; GCN: {{^}}[[BODY]]: ; GCN: buffer_store ; GCN: s_endpgm -define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { +define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp sgt i32 %cond0, 0 @@ -284,7 +284,7 @@ ; SI: s_cmp_lg_u32 [[I]], 0 ; SI: s_cbranch_scc1 [[LOOP_LABEL]] ; SI: s_endpgm -define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { entry: br label %loop @@ -310,7 +310,7 @@ ; GCN: {{^}}[[IF_UNIFORM_LABEL]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp ult i32 %tid, 16 @@ -338,7 +338,7 @@ ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: %u_cmp = icmp eq i32 %cond, 0 br i1 %u_cmp, label %if, label %endif @@ -370,7 +370,7 @@ ; GCN: [[IF_UNIFORM]]: ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; GCN: buffer_store_dword [[TWO]] -define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp eq i32 %tid, 0 @@ -410,7 +410,7 @@ ; GCN: BB[[FNNUM]]_3: ; GCN: s_endpgm -define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp1 = icmp sgt i32 %cond, 0 @@ -445,7 +445,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -477,7 +477,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* 
%out) { +define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp ne i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -505,7 +505,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp sgt i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -524,7 +524,7 @@ ; GCN-LABEL: {{^}}move_to_valu_i64_eq: ; GCN: v_cmp_eq_u64_e32 -define void @move_to_valu_i64_eq(i32 addrspace(1)* %out) { +define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) { %cond = load volatile i64, i64 addrspace(3)* undef %cmp0 = icmp eq i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -543,7 +543,7 @@ ; GCN-LABEL: {{^}}move_to_valu_i64_ne: ; GCN: v_cmp_ne_u64_e32 -define void @move_to_valu_i64_ne(i32 addrspace(1)* %out) { +define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) { %cond = load volatile i64, i64 addrspace(3)* undef %cmp0 = icmp ne i64 %cond, 0 br i1 %cmp0, label %if, label %else Index: test/CodeGen/AMDGPU/uniform-crash.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-crash.ll +++ test/CodeGen/AMDGPU/uniform-crash.ll @@ -6,7 +6,7 @@ ; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]] ; GCN: [[LABEL]]: ; GCN-NEXT: s_endpgm -define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 %1 = sext i1 %0 to i32 @@ -25,7 +25,7 @@ ; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]: ; GCN: s_cbranch_scc1 [[LOOP]] ; GCN: {{^}}[[BB0]]: -define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) { +define amdgpu_kernel void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) { bb: %cnd = trunc i32 %arg to i1 br i1 %cnd, label %bb2, label %bb5 Index: test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: s_xor_b64 ; CHECK-NEXT: ; mask branch ; CHECK-NEXT: s_cbranch_execz -define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { main_body: %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %cc = icmp eq i32 %tid, 0 Index: test/CodeGen/AMDGPU/unknown-processor.ll =================================================================== --- test/CodeGen/AMDGPU/unknown-processor.ll +++ test/CodeGen/AMDGPU/unknown-processor.ll @@ -13,7 +13,7 @@ ; GCN: ScratchSize: 8{{$}} ; R600: MOV -define void @foo() { +define amdgpu_kernel void @foo() { %alloca = alloca i32, align 4 store volatile i32 0, i32* %alloca ret void Index: test/CodeGen/AMDGPU/unroll.ll =================================================================== --- test/CodeGen/AMDGPU/unroll.ll +++ test/CodeGen/AMDGPU/unroll.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: @test ; CHECK-NOT: alloca ; CHECK: store i32 5, i32 addrspace(1)* %out -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: %0 = alloca [32 x i32] br label %loop.header Index: test/CodeGen/AMDGPU/unsupported-cc.ll =================================================================== --- test/CodeGen/AMDGPU/unsupported-cc.ll +++ 
test/CodeGen/AMDGPU/unsupported-cc.ll @@ -6,7 +6,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) -define void @slt(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp slt i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -18,7 +18,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) -define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp ult i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -31,7 +31,7 @@ ; CHECK-NEXT: 1084227584(5.000000e+00) ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 ; CHECK-NEXT: LSHR * -define void @ult_float(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ult float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -43,7 +43,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ult_float_native(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ult float %in, 5.0 %1 = select i1 %0, float 0.0, float 1.0 @@ -55,7 +55,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @olt(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -67,7 +67,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) -define void @sle(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sle i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -79,7 +79,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) -define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp ule i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -92,7 +92,7 @@ ; CHECK-NEXT: 1084227584(5.000000e+00) ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 ; CHECK-NEXT: LSHR * -define void @ule_float(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ule float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -104,7 +104,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ule_float_native(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ule float %in, 5.0 %1 = select i1 %0, float 0.0, float 1.0 @@ -116,7 +116,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGE {{\*? 
*}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT:1084227584(5.000000e+00) -define void @ole(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 Index: test/CodeGen/AMDGPU/urem.ll =================================================================== --- test/CodeGen/AMDGPU/urem.ll +++ test/CodeGen/AMDGPU/urem.ll @@ -9,7 +9,7 @@ ; FUNC-LABEL: {{^}}test_urem_i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -26,7 +26,7 @@ ; SI: v_sub_i32 ; SI: buffer_store_dword ; SI: s_endpgm -define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = urem i32 %num, 7 store i32 %result, i32 addrspace(1)* %out @@ -36,7 +36,7 @@ ; FUNC-LABEL: {{^}}test_urem_v2i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -48,7 +48,7 @@ ; FUNC-LABEL: {{^}}test_urem_v4i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -60,7 +60,7 @@ ; FUNC-LABEL: {{^}}test_urem_i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -72,7 +72,7 @@ ; FUNC-LABEL: {{^}}test_urem_v2i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -84,7 +84,7 @@ ; FUNC-LABEL: {{^}}test_urem_v4i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll =================================================================== --- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ 
test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -11,7 +11,7 @@ ; GCN: s_load_dword [[SGPR:s[0-9]+]], ; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { %dbl = fadd float %a, %a store float %dbl, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ ; GCN: s_load_dword [[SGPR:s[0-9]+]], ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -58,7 +58,7 @@ ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] -define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { %va0 = load volatile float, float addrspace(1)* %in %va1 = load volatile float, float addrspace(1)* %in %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1 @@ -76,7 +76,7 @@ ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -90,7 +90,7 @@ ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -100,7 +100,7 @@ ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -110,7 +110,7 @@ ; GCN: 
s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -121,7 +121,7 @@ ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 { %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1 store float %val, float addrspace(1)* %out, align 4 ret void @@ -132,7 +132,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -143,7 +143,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT0]] -define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 store float %fma, float addrspace(1)* %out ret void @@ -158,7 +158,7 @@ ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1 store volatile float %fma0, float addrspace(1)* %out @@ -171,7 +171,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 store float %fma, float addrspace(1)* %out ret void @@ -186,7 +186,7 @@ ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1 store volatile float %fma0, float addrspace(1)* %out 
@@ -199,7 +199,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 store float %fma, float addrspace(1)* %out ret void @@ -214,7 +214,7 @@ ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -234,7 +234,7 @@ ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] -define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -259,7 +259,7 @@ ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]] -define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1 %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1 store volatile double %fma0, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/usubo.ll =================================================================== --- test/CodeGen/AMDGPU/usubo.ll +++ test/CodeGen/AMDGPU/usubo.ll @@ -9,7 +9,7 @@ ; EG: SUBB_UINT ; EG: ADDC_UINT -define void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -27,7 +27,7 @@ ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 @@ -42,7 +42,7 @@ ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -63,7 +63,7 @@ ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 
addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -87,7 +87,7 @@ ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT ; EG: SUB_INT -define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -104,7 +104,7 @@ ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT ; EG: SUB_INT -define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr @@ -122,7 +122,7 @@ ; FUNC-LABEL: {{^}}v_usubo_i16: ; VI: v_subrev_u16_e32 ; VI: v_cmp_gt_u16_e32 -define void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr Index: test/CodeGen/AMDGPU/v1i64-kernel-arg.ll =================================================================== --- test/CodeGen/AMDGPU/v1i64-kernel-arg.ll +++ test/CodeGen/AMDGPU/v1i64-kernel-arg.ll @@ -1,14 +1,14 @@ ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s ; CHECK-LABEL: {{^}}kernel_arg_i64: -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } ; i64 arg works, v1i64 arg does not. 
; CHECK-LABEL: {{^}}kernel_arg_v1i64: -define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 ret void } Index: test/CodeGen/AMDGPU/v_cndmask.ll =================================================================== --- test/CodeGen/AMDGPU/v_cndmask.ll +++ test/CodeGen/AMDGPU/v_cndmask.ll @@ -9,7 +9,7 @@ ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff ; GCN: s_endpgm -define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { +define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx %f = load float, float addrspace(1)* %f.gep @@ -30,7 +30,7 @@ ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff ; GCN: s_endpgm -define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { +define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f store float %select, float addrspace(1)* %out @@ -47,7 +47,7 @@ ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -62,7 +62,7 @@ ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -77,7 +77,7 @@ ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -92,7 +92,7 @@ ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -107,7 +107,7 @@ ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN-DAG: v_cmp_nlg_f32_e64 
[[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]] -define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext @@ -124,7 +124,7 @@ ; GCN-DAG: s_load_dword [[X:s[0-9]+]] ; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]] -define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext @@ -142,7 +142,7 @@ ; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -159,7 +159,7 @@ ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc -define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -178,7 +178,7 @@ ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc -define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -203,7 +203,7 @@ ; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s -define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext @@ -226,7 +226,7 @@ ; 
GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -249,7 +249,7 @@ ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -275,7 +275,7 @@ ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -298,7 +298,7 @@ ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s ; GCN: store_byte -define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -321,7 +321,7 @@ ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -343,7 +343,7 @@ ; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* 
%z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -364,7 +364,7 @@ ; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc -define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -386,7 +386,7 @@ ; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]] ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc -define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext Index: test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll =================================================================== --- test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll +++ test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -40,7 +40,7 @@ ; GCN: v_cvt_pk_u8_f32 
v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) { %result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0 %result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0 %result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0 @@ -51,7 +51,7 @@ ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/v_mac.ll =================================================================== --- test/CodeGen/AMDGPU/v_mac.ll +++ test/CodeGen/AMDGPU/v_mac.ll @@ -8,7 +8,7 @@ ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 ; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]] ; GCN: buffer_store_dword [[C]] -define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -26,7 +26,7 @@ ; GCN-LABEL: {{^}}mad_inline_sgpr_inline: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5 -define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 { entry: %tmp0 = fmul float 0.5, %in %tmp1 = fadd float %tmp0, 0.5 @@ -37,7 +37,7 @@ ; GCN-LABEL: {{^}}mad_vvs: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 { +define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}mac_ssv: ; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 { +define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 { entry: %c = load float, float addrspace(1)* %in @@ -65,7 +65,7 @@ ; GCN-LABEL: {{^}}mac_mad_same_add: ; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} -define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -96,7 +96,7 @@ ; GCN-LABEL: {{^}}mad_neg_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel 
void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -116,7 +116,7 @@ ; GCN-LABEL: {{^}}nsz_mad_sub0_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -136,7 +136,7 @@ ; GCN-LABEL: {{^}}safe_mad_sub0_src0: ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, ; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]] -define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -156,7 +156,7 @@ ; GCN-LABEL: {{^}}mad_neg_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -176,7 +176,7 @@ ; GCN-LABEL: {{^}}nsz_mad_sub0_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -196,7 +196,7 @@ ; GCN-LABEL: {{^}}mad_neg_src2: ; GCN-NOT: v_mac ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -222,7 +222,7 @@ ; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] ; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 { +define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -257,7 +257,7 @@ ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 { +define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/v_mac_f16.ll 
=================================================================== --- test/CodeGen/AMDGPU/v_mac_f16.ll +++ test/CodeGen/AMDGPU/v_mac_f16.ll @@ -14,7 +14,7 @@ ; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] ; VI: buffer_store_short v[[C_F16]] ; GCN: s_endpgm -define void @mac_f16( +define amdgpu_kernel void @mac_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -38,7 +38,7 @@ ; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_same_add( +define amdgpu_kernel void @mac_f16_same_add( half addrspace(1)* %r0, half addrspace(1)* %r1, half addrspace(1)* %a, @@ -73,7 +73,7 @@ ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_a( +define amdgpu_kernel void @mac_f16_neg_a( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -100,7 +100,7 @@ ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_b( +define amdgpu_kernel void @mac_f16_neg_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -127,7 +127,7 @@ ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_c( +define amdgpu_kernel void @mac_f16_neg_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -151,7 +151,7 @@ ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; GCN: s_endpgm -define void @mac_f16_neg_a_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -175,7 +175,7 @@ ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_b_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -199,7 +199,7 @@ ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_c_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -226,7 +226,7 @@ ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_a_nsz_fp_math( +define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -253,7 +253,7 @@ ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_b_nsz_fp_math( +define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -280,7 +280,7 @@ ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_c_nsz_fp_math( +define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -328,7 +328,7 @@ ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @mac_v2f16( +define amdgpu_kernel void @mac_v2f16( <2 x half> addrspace(1)* %r, <2 
x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -355,7 +355,7 @@ ; VI: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_same_add( +define amdgpu_kernel void @mac_v2f16_same_add( <2 x half> addrspace(1)* %r0, <2 x half> addrspace(1)* %r1, <2 x half> addrspace(1)* %a, @@ -392,7 +392,7 @@ ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_a( +define amdgpu_kernel void @mac_v2f16_neg_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -421,7 +421,7 @@ ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b( +define amdgpu_kernel void @mac_v2f16_neg_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -454,7 +454,7 @@ ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c( +define amdgpu_kernel void @mac_v2f16_neg_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -482,7 +482,7 @@ ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] ; GCN: s_endpgm -define void @mac_v2f16_neg_a_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -510,7 +510,7 @@ ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -538,7 +538,7 @@ ; VI: v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -571,7 +571,7 @@ ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_a_nsz_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -604,7 +604,7 @@ ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b_nsz_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -637,7 +637,7 @@ ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c_nsz_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, Index: test/CodeGen/AMDGPU/v_madak_f16.ll 
=================================================================== --- test/CodeGen/AMDGPU/v_madak_f16.ll +++ test/CodeGen/AMDGPU/v_madak_f16.ll @@ -7,7 +7,7 @@ ; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} ; VI: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @madak_f16( +define amdgpu_kernel void @madak_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -28,7 +28,7 @@ ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @madak_f16_use_2( +define amdgpu_kernel void @madak_f16_use_2( half addrspace(1)* %r0, half addrspace(1)* %r1, half addrspace(1)* %a, Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -29,7 +29,7 @@ ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]] ; SI-NEXT: ; mask branch ; -define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone switch i32 %tid, label %default [ @@ -76,7 +76,7 @@ ; SI: BB1_2: ; SI: s_or_b64 exec, exec, [[BR_SREG]] ; SI: s_endpgm -define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 br i1 %is.0, label %store, label %exit @@ -106,7 +106,7 @@ ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm -define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 @@ -173,7 +173,7 @@ ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm -define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { +define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp4 = sext i32 %tmp to i64 Index: test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir =================================================================== --- test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -1,7 +1,7 @@ # RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s --- | - define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = fcmp oeq float %cond, 0.000000e+00 br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 @@ -20,7 +20,7 @@ ret void } - define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { entry: br i1 undef, label %if, label %else, 
!structurizecfg.uniform !0, !amdgpu.uniform !0 Index: test/CodeGen/AMDGPU/vector-alloca.ll =================================================================== --- test/CodeGen/AMDGPU/vector-alloca.ll +++ test/CodeGen/AMDGPU/vector-alloca.ll @@ -15,7 +15,7 @@ ; EG: MOV ; EG: MOV ; EG: MOVA_INT -define void @vector_read(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -44,7 +44,7 @@ ; EG: MOV ; EG: MOVA_INT ; EG: MOVA_INT -define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -71,7 +71,7 @@ ; FUNC-LABEL: {{^}}bitcast_gep: ; EG: STORE_RAW -define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -93,7 +93,7 @@ ; OPT-LABEL: @vector_read_bitcast_gep( ; OPT: %0 = extractelement <4 x i32> , i32 %index ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 -define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -121,7 +121,7 @@ ; OPT: store float ; OPT: store float ; OPT: load float -define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]* Index: test/CodeGen/AMDGPU/vector-extract-insert.ll =================================================================== --- test/CodeGen/AMDGPU/vector-extract-insert.ll +++ test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -13,7 +13,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -30,7 +30,7 @@ ; GCN: v_movreld_b32 ; GCN: v_movrels_b32 ; GCN: buffer_store_dword v -define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { +define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -49,7 +49,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 
addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -68,7 +68,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext Index: test/CodeGen/AMDGPU/vectorize-global-local.ll =================================================================== --- test/CodeGen/AMDGPU/vectorize-global-local.ll +++ test/CodeGen/AMDGPU/vectorize-global-local.ll @@ -12,7 +12,7 @@ ; CHECK-DAG: ds_write2_b32 ; CHECK-DAG: ds_write2_b32 -define void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) { +define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) { bb: %tmp = load i32, i32 addrspace(1)* %arg, align 4 store i32 %tmp, i32 addrspace(3)* %arg1, align 4 Index: test/CodeGen/AMDGPU/vertex-fetch-encoding.ll =================================================================== --- test/CodeGen/AMDGPU/vertex-fetch-encoding.ll +++ test/CodeGen/AMDGPU/vertex-fetch-encoding.ll @@ -6,7 +6,7 @@ ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 -define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %v = load i32, i32 addrspace(1)* %in store i32 %v, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ ; EG: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 ; CM: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x00,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x00,0x00 -define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %v = load <4 x i32>, <4 x i32> addrspace(1)* %in store <4 x i32> %v, <4 x i32> addrspace(1)* %out ret void @@ -26,7 +26,7 @@ ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 -define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { +define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { %v = load i32, i32 addrspace(7)* %in store i32 %v, i32 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ @t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3] -define void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void 
@vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) { %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in %v = load i32, i32 addrspace(2)* %a store i32 %v, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -45,7 +45,7 @@ ; GCN: ScratchSize: 1536 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset. -define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { bb: %tmp = add i32 %arg1, %arg2 %tmp7 = extractelement <4 x float> %arg6, i32 0 Index: test/CodeGen/AMDGPU/vi-removed-intrinsics.ll =================================================================== --- test/CodeGen/AMDGPU/vi-removed-intrinsics.ll +++ test/CodeGen/AMDGPU/vi-removed-intrinsics.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.rsq.legacy(float) #0 -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4 store float %rsq, float addrspace(1)* %out, align 4 ret void Index: test/CodeGen/AMDGPU/vop-shrink.ll =================================================================== --- test/CodeGen/AMDGPU/vop-shrink.ll +++ test/CodeGen/AMDGPU/vop-shrink.ll @@ -8,7 +8,7 @@ ; ModuleID = 'vop-shrink.ll' -define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { +define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { entry: %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1 %tmp = icmp eq i32 %cond, 0 @@ -35,7 +35,7 @@ ; FUNC-LABEL: {{^}}add_fold: ; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 -define void @add_fold(float addrspace(1)* %out) { +define amdgpu_kernel void @add_fold(float addrspace(1)* %out) { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = uitofp i32 %tmp to float Index: test/CodeGen/AMDGPU/vselect.ll =================================================================== --- test/CodeGen/AMDGPU/vselect.ll +++ test/CodeGen/AMDGPU/vselect.ll @@ -10,7 +10,7 @@ ; SI: v_cndmask_b32_e64 ; SI: v_cndmask_b32_e32 -define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { +define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { entry: %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 @@ -28,7 +28,7 @@ ;SI: v_cndmask_b32_e64 ;SI: v_cndmask_b32_e32 -define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { +define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { entry: %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 @@ -52,7 +52,7 @@ ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 -define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> 
addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { +define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { entry: %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 @@ -68,7 +68,7 @@ ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { +define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { entry: %0 = load <4 x float>, <4 x float> addrspace(1)* %in0 %1 = load <4 x float>, <4 x float> addrspace(1)* %in1 Index: test/CodeGen/AMDGPU/vselect64.ll =================================================================== --- test/CodeGen/AMDGPU/vselect64.ll +++ test/CodeGen/AMDGPU/vselect64.ll @@ -5,7 +5,7 @@ ; Make sure the vectors aren't being stored on the stack. We know they are ; being stored on the stack if the shader uses at least 10 registers. ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X -define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { +define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { entry: %cmp = icmp ne <4 x i32> %c, %result = select <4 x i1> %cmp, <4 x i64> , <4 x i64> Index: test/CodeGen/AMDGPU/vtx-fetch-branch.ll =================================================================== --- test/CodeGen/AMDGPU/vtx-fetch-branch.ll +++ test/CodeGen/AMDGPU/vtx-fetch-branch.ll @@ -10,7 +10,7 @@ ; CHECK-NOT: ALU_POP_AFTER ; CHECK: TEX ; CHECK-NEXT: POP -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %endif, label %if Index: test/CodeGen/AMDGPU/vtx-schedule.ll =================================================================== --- test/CodeGen/AMDGPU/vtx-schedule.ll +++ test/CodeGen/AMDGPU/vtx-schedule.ll @@ -9,7 +9,7 @@ ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 ; CHECK: Fetch clause ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 -define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { +define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { entry: %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 %1 = load i32, i32 addrspace(1)* %0 Index: test/CodeGen/AMDGPU/waitcnt-flat.ll =================================================================== --- test/CodeGen/AMDGPU/waitcnt-flat.ll +++ test/CodeGen/AMDGPU/waitcnt-flat.ll @@ -9,7 +9,7 @@ ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] -define void @test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { store volatile i32 0, i32 addrspace(1)* %out %val = load volatile i32, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/waitcnt.mir =================================================================== --- test/CodeGen/AMDGPU/waitcnt.mir +++ test/CodeGen/AMDGPU/waitcnt.mir @@ -1,18 +1,18 @@ # RUN: llc
-march=amdgcn -mcpu=fiji -run-pass si-insert-waits %s -o - | FileCheck %s --- | - define void @flat_zero_waitcnt(i32 addrspace(1)* %global4, + define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4, <4 x i32> addrspace(1)* %global16, i32 addrspace(4)* %flat4, <4 x i32> addrspace(4)* %flat16) { ret void } - define void @single_fallthrough_successor_no_end_block_wait() { + define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() { ret void } - define void @single_branch_successor_not_next_block() { + define amdgpu_kernel void @single_branch_successor_not_next_block() { ret void } Index: test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll =================================================================== --- test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll +++ test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 -define void @write_vgpr_into_sgpr() { +define amdgpu_kernel void @write_vgpr_into_sgpr() { %tid = call i32 @llvm.amdgcn.workitem.id.x() call void @llvm.write_register.i32(metadata !0, i32 %tid) ret void Index: test/CodeGen/AMDGPU/write_register.ll =================================================================== --- test/CodeGen/AMDGPU/write_register.ll +++ test/CodeGen/AMDGPU/write_register.ll @@ -4,7 +4,7 @@ declare void @llvm.write_register.i64(metadata, i64) #0 ; CHECK-LABEL: {{^}}test_write_m0: -define void @test_write_m0(i32 %val) #0 { +define amdgpu_kernel void @test_write_m0(i32 %val) #0 { call void @llvm.write_register.i32(metadata !0, i32 0) call void @llvm.write_register.i32(metadata !0, i32 -1) call void @llvm.write_register.i32(metadata !0, i32 %val) @@ -15,7 +15,7 @@ ; CHECK: s_mov_b64 exec, 0 ; CHECK: s_mov_b64 exec, -1 ; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}} -define void @test_write_exec(i64 %val) #0 { +define amdgpu_kernel void @test_write_exec(i64 %val) #0 { call void @llvm.write_register.i64(metadata !1, i64 0) call void @llvm.write_register.i64(metadata !1, i64 -1) call void @llvm.write_register.i64(metadata !1, i64 %val) @@ -26,7 +26,7 @@ ; CHECK: s_mov_b64 flat_scratch, 0 ; CHECK: s_mov_b64 flat_scratch, -1 ; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}} -define void @test_write_flat_scratch(i64 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch(i64 %val) #0 { call void @llvm.write_register.i64(metadata !2, i64 0) call void @llvm.write_register.i64(metadata !2, i64 -1) call void @llvm.write_register.i64(metadata !2, i64 %val) @@ -36,7 +36,7 @@ ; CHECK-LABEL: {{^}}test_write_flat_scratch_lo: ; CHECK: s_mov_b32 flat_scratch_lo, 0 ; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}} -define void @test_write_flat_scratch_lo(i32 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 { call void @llvm.write_register.i32(metadata !3, i32 0) call void @llvm.write_register.i32(metadata !3, i32 %val) ret void @@ -45,7 +45,7 @@ ; CHECK-LABEL: {{^}}test_write_flat_scratch_hi: ; CHECK: s_mov_b32 flat_scratch_hi, 0 ; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}} -define void @test_write_flat_scratch_hi(i32 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 { call void @llvm.write_register.i32(metadata !4, i32 0) call void @llvm.write_register.i32(metadata !4, i32 %val) ret void @@ -54,7 +54,7 @@ ; CHECK-LABEL: {{^}}test_write_exec_lo: ; CHECK: s_mov_b32 exec_lo, 0 ; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}} -define void @test_write_exec_lo(i32 %val) #0 { +define amdgpu_kernel void 
@test_write_exec_lo(i32 %val) #0 { call void @llvm.write_register.i32(metadata !5, i32 0) call void @llvm.write_register.i32(metadata !5, i32 %val) ret void @@ -63,7 +63,7 @@ ; CHECK-LABEL: {{^}}test_write_exec_hi: ; CHECK: s_mov_b32 exec_hi, 0 ; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}} -define void @test_write_exec_hi(i32 %val) #0 { +define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 { call void @llvm.write_register.i32(metadata !6, i32 0) call void @llvm.write_register.i32(metadata !6, i32 %val) ret void Index: test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll =================================================================== --- test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll +++ test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll @@ -4,7 +4,7 @@ ;CHECK: {{^}}fill3d: ;CHECK-NOT: MULLO_INT T[0-9]+ -define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 { entry: %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 Index: test/CodeGen/AMDGPU/xfail.r600.bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/xfail.r600.bitcast.ll +++ test/CodeGen/AMDGPU/xfail.r600.bitcast.ll @@ -5,7 +5,7 @@ ; TODO: enable doubles ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { %val = load double, double addrspace(1)* %in, align 8 %add = fadd double %val, 4.0 %bc = bitcast double %add to <2 x i32> @@ -14,7 +14,7 @@ } ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64: -define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { +define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -30,7 +30,7 @@ } ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64: -define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { +define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end Index: test/CodeGen/AMDGPU/xor.ll =================================================================== --- test/CodeGen/AMDGPU/xor.ll +++ test/CodeGen/AMDGPU/xor.ll @@ -10,7 +10,7 @@ ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { +define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 %result = xor <2 x i32> %a, %b @@ -29,7 +29,7 @@ ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 %result = xor <4 x i32> %a, %b @@ -46,7 +46,7 @@ ; SI: 
v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { +define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1) * %in0 %b = load float, float addrspace(1) * %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -63,7 +63,7 @@ ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] ; SI: buffer_store_byte [[RESULT]] -define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { +define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { %a = load volatile i1, i1 addrspace(1)* %in0 %b = load volatile i1, i1 addrspace(1)* %in1 %xor = xor i1 %a, %b @@ -73,7 +73,7 @@ ; FUNC-LABEL: {{^}}vector_xor_i32: ; SI: v_xor_b32_e32 -define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %a = load i32, i32 addrspace(1)* %in0 %b = load i32, i32 addrspace(1)* %in1 %result = xor i32 %a, %b @@ -83,7 +83,7 @@ ; FUNC-LABEL: {{^}}scalar_xor_i32: ; SI: s_xor_b32 -define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = xor i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void @@ -91,7 +91,7 @@ ; FUNC-LABEL: {{^}}scalar_not_i32: ; SI: s_not_b32 -define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { %result = xor i32 %a, -1 store i32 %result, i32 addrspace(1)* %out ret void @@ -99,7 +99,7 @@ ; FUNC-LABEL: {{^}}vector_not_i32: ; SI: v_not_b32 -define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %a = load i32, i32 addrspace(1)* %in0 %b = load i32, i32 addrspace(1)* %in1 %result = xor i32 %a, -1 @@ -111,7 +111,7 @@ ; SI: v_xor_b32_e32 ; SI: v_xor_b32_e32 ; SI: s_endpgm -define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { %a = load i64, i64 addrspace(1)* %in0 %b = load i64, i64 addrspace(1)* %in1 %result = xor i64 %a, %b @@ -122,7 +122,7 @@ ; FUNC-LABEL: {{^}}scalar_xor_i64: ; SI: s_xor_b64 ; SI: s_endpgm -define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = xor i64 %a, %b store i64 %result, i64 addrspace(1)* %out ret void @@ -130,7 +130,7 @@ ; FUNC-LABEL: {{^}}scalar_not_i64: ; SI: s_not_b64 -define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { %result = xor i64 %a, -1 store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +139,7 @@ ; FUNC-LABEL: {{^}}vector_not_i64: ; SI: v_not_b32 ; SI: v_not_b32 -define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 
addrspace(1)* %in0, i64 addrspace(1)* %in1) { %a = load i64, i64 addrspace(1)* %in0 %b = load i64, i64 addrspace(1)* %in1 %result = xor i64 %a, -1 @@ -153,7 +153,7 @@ ; FUNC-LABEL: {{^}}xor_cf: ; SI: s_xor_b64 -define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { +define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -178,7 +178,7 @@ ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void @@ -192,7 +192,7 @@ ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -211,7 +211,7 @@ ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: xor_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -220,7 +220,7 @@ ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64: ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI: s_xor_b64 [[VAL]], [[VAL]], -8 -define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -231,7 +231,7 @@ ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]] ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}} ; SI: s_endpgm -define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = xor i64 %loada, -8 store i64 %or, i64 addrspace(1)* %out @@ -243,7 +243,7 @@ ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm -define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = xor i64 %loada, 22470723082367 store i64 %or, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/zero_extend.ll =================================================================== --- test/CodeGen/AMDGPU/zero_extend.ll +++ test/CodeGen/AMDGPU/zero_extend.ll @@ -9,7 +9,7 @@ ; SI: {{^}}s_mad_zext_i32_to_i64: ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 
%a, i32 %b, i32 %c) #0 { entry: %tmp0 = mul i32 %a, %b %tmp1 = add i32 %tmp0, %c @@ -20,7 +20,7 @@ ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32 ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %tmp0 = icmp eq i32 %a, %b %tmp1 = zext i1 %tmp0 to i32 @@ -29,7 +29,7 @@ } ; SI-LABEL: {{^}}s_arg_zext_i1_to_i64: -define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { +define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { %ext = zext i1 %arg to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void @@ -39,7 +39,7 @@ ; SI: s_mov_b32 s{{[0-9]+}}, 0 ; SI: v_cmp_eq_u32 ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 @@ -49,7 +49,7 @@ ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; SI: buffer_store_short [[RESULT]] -define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { %tmp0 = icmp eq i16 %a, %b %tmp1 = zext i1 %tmp0 to i16 store i16 %tmp1, i16 addrspace(1)* %out Index: test/CodeGen/AMDGPU/zext-i64-bit-operand.ll =================================================================== --- test/CodeGen/AMDGPU/zext-i64-bit-operand.ll +++ test/CodeGen/AMDGPU/zext-i64-bit-operand.ll @@ -11,7 +11,7 @@ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 %ext = zext i32 %ld.32 to i64 @@ -31,7 +31,7 @@ ; GCN-NOT: _or_ ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 %ext = zext i32 %ld.32 to i64 Index: test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir =================================================================== --- test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir +++ test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir @@ -6,7 +6,7 @@ @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4 - define void @float(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 Index: test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir 
=================================================================== --- test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir +++ test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir @@ -1,6 +1,6 @@ # RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s --- | - define void @add_f32_1.0_one_f16_use() #0 { + define amdgpu_kernel void @add_f32_1.0_one_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -11,7 +11,7 @@ ret void } - define void @add_f32_1.0_multi_f16_use() #0 { + define amdgpu_kernel void @add_f32_1.0_multi_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -22,7 +22,7 @@ ret void } - define void @add_f32_1.0_one_f32_use_one_f16_use () #0 { + define amdgpu_kernel void @add_f32_1.0_one_f32_use_one_f16_use () #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -33,7 +33,7 @@ ret void } - define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 { + define amdgpu_kernel void @add_f32_1.0_one_f32_use_multi_f16_use () #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -46,7 +46,7 @@ ret void } - define void @add_i32_1_multi_f16_use() #0 { + define amdgpu_kernel void @add_i32_1_multi_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f16.add0 = fadd half %f16.val0, 0xH0001 @@ -56,7 +56,7 @@ ret void } - define void @add_i32_m2_one_f32_use_multi_f16_use () #0 { + define amdgpu_kernel void @add_i32_m2_one_f32_use_multi_f16_use () #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -69,7 +69,7 @@ ret void } - define void @add_f16_1.0_multi_f32_use() #0 { + define amdgpu_kernel void @add_f16_1.0_multi_f32_use() #0 { %f32.val0 = load volatile float, float addrspace(1)* undef %f32.val1 = load volatile float, float addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -80,7 +80,7 @@ ret void } - define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 { + define amdgpu_kernel void @add_f16_1.0_other_high_bits_multi_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile half, half addrspace(1)* undef @@ -91,7 +91,7 @@ ret void } - define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 { + define amdgpu_kernel void @add_f16_1.0_other_high_bits_use_f16_f32() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile half, half addrspace(1)* undef Index: test/CodeGen/MIR/AMDGPU/intrinsics.mir =================================================================== --- test/CodeGen/MIR/AMDGPU/intrinsics.mir +++ test/CodeGen/MIR/AMDGPU/intrinsics.mir @@ -2,7 +2,7 @@ --- | - define void @use_intrin() { + define amdgpu_kernel void @use_intrin() { ret void } Index: 
test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir =================================================================== --- test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir +++ test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir @@ -6,7 +6,7 @@ @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4 - define void @float(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 Index: test/CodeGen/MIR/AMDGPU/target-index-operands.mir =================================================================== --- test/CodeGen/MIR/AMDGPU/target-index-operands.mir +++ test/CodeGen/MIR/AMDGPU/target-index-operands.mir @@ -7,7 +7,7 @@ @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4 - define void @float(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 @@ -15,7 +15,7 @@ ret void } - define void @float2(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float2(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 Index: test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll =================================================================== --- test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll +++ test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll @@ -5,7 +5,7 @@ ; ASC-NOT: ptrtoint ; ASC-NOT: inttoptr -define void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 { +define amdgpu_kernel void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg2, i32 16 %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 Index: test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll =================================================================== --- test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll +++ test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll @@ -14,7 +14,7 @@ ; CHECK-LABEL: @indvar_32_bit( ; CHECK-NOT: sext i32 ; CHECK: phi i32 -define void @indvar_32_bit(i32 %n, i32* nocapture %output) { +define amdgpu_kernel void @indvar_32_bit(i32 %n, i32* nocapture %output) { entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.end @@ -46,7 +46,7 @@ ; CHECK-NOT: ashr i64 ; CHECK-NOT: mul nsw i64 ; CHECK-NOT: add nsw i64 -define void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: br label %for.body @@ -72,7 +72,7 @@ ; be legalized anyway. 
; CHECK-LABEL: @indvar_48_bit( -define void @indvar_48_bit(i48 %n, i48* nocapture %output) { +define amdgpu_kernel void @indvar_48_bit(i48 %n, i48* nocapture %output) { entry: %cmp5 = icmp sgt i48 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.end Index: test/Transforms/InferAddressSpaces/AMDGPU/basic.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -45,7 +45,7 @@ ; CHECK-LABEL: @store_global_from_flat( ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0 -define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 { +define amdgpu_kernel void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 { %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* store float 0.0, float addrspace(1)* %tmp0 ret void @@ -54,7 +54,7 @@ ; CHECK-LABEL: @store_group_from_flat( ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* ; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0 -define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 { +define amdgpu_kernel void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 { %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* store float 0.0, float addrspace(3)* %tmp0 ret void @@ -63,7 +63,7 @@ ; CHECK-LABEL: @store_private_from_flat( ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float* ; CHECK-NEXT: store float 0.000000e+00, float* %tmp0 -define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 { +define amdgpu_kernel void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 { %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float* store float 0.0, float* %tmp0 ret void @@ -74,7 +74,7 @@ ; CHECK-NEXT: %val = load i32, i32 addrspace(1)* %input, align 4 ; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -87,7 +87,7 @@ ; CHECK-NEXT: %val = load i32, i32 addrspace(3)* %input, align 4 ; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { +define amdgpu_kernel void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -100,7 +100,7 @@ ; CHECK-NEXT: %val = load i32, i32* %input, align 4 ; CHECK-NEXT: store i32 %val, i32* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 { +define amdgpu_kernel void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 { %tmp0 = addrspacecast i32* 
%input to i32 addrspace(4)* %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -113,7 +113,7 @@ ; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4 ; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 { +define amdgpu_kernel void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 { %val = load i32, i32 addrspace(4)* %input, align 4 store i32 %val, i32 addrspace(4)* %output, align 4 ret void @@ -122,7 +122,7 @@ ; CHECK-LABEL: @store_addrspacecast_ptr_value( ; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* ; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4 -define void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 { %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4 ret void Index: test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll @@ -28,7 +28,7 @@ ; CHECK: store float %v, float addrspace(3)* %tmp7, align 4 ; CHECK: call void @llvm.amdgcn.s.barrier() ; CHECK: ret void -define void @load_store_lds_f32(i32 %i, float %v) #0 { +define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 { bb: %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4 call void @use(float %tmp) @@ -83,7 +83,7 @@ ; CHECK-LABEL: @nested_const_expr( ; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4 -define void @nested_const_expr() #0 { +define amdgpu_kernel void @nested_const_expr() #0 { store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4 ret void } @@ -93,7 +93,7 @@ ; CHECK-NEXT: %v = load float, float addrspace(1)* %addr ; CHECK-NEXT: store float %v, float addrspace(1)* %addr ; CHECK-NEXT: ret void -define void @rauw(float addrspace(1)* %input) #0 { +define amdgpu_kernel void @rauw(float addrspace(1)* %input) #0 { bb: %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)* %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10 @@ -117,7 +117,7 @@ ; CHECK: %exit_cond = icmp eq float addrspace(3)* %i2, %end ; CHECK: br i1 %exit_cond, label %exit, label %loop -define void @loop() #0 { +define amdgpu_kernel void @loop() #0 { entry: %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* %end = getelementptr float, float addrspace(4)* %p, i64 10 @@ -150,7 +150,7 @@ ; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)* ; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end ; CHECK: br i1 %exit_cond, 
label %exit, label %loop -define void @loop_with_generic_bound() #0 { +define amdgpu_kernel void @loop_with_generic_bound() #0 { entry: %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end Index: test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: @memset_group_to_flat( ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { +define amdgpu_kernel void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -10,7 +10,7 @@ ; CHECK-LABEL: @memset_global_to_flat( ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { +define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -18,7 +18,7 @@ ; CHECK-LABEL: @memset_group_to_flat_no_md( ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} -define void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) ret void @@ -26,7 +26,7 @@ ; CHECK-LABEL: @memset_global_to_flat_no_md( ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} -define void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 { +define amdgpu_kernel void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 { %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) ret void @@ -34,7 +34,7 @@ ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group( ; CHCK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -42,7 +42,7 @@ ; 
CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group( ; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 { %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -50,7 +50,7 @@ ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group( ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 @@ -59,7 +59,7 @@ ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global( ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)* %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 @@ -68,7 +68,7 @@ ; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global( ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 { +define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 { %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -76,7 +76,7 @@ ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, 
i64 %size, i32 4, i1 false), !tbaa.struct !7 -define void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7 ret void @@ -84,7 +84,7 @@ ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} -define void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) ret void @@ -93,7 +93,7 @@ ; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} -define void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) @@ -103,14 +103,14 @@ ; Check for iterator problems if the pointer has 2 uses in the same call ; CHECK-LABEL: @memcpy_group_flat_to_flat_self( ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 { +define amdgpu_kernel void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group( ; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = 
addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void Index: test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: @generic_address_bitcast_const( ; CHECK: %vecload1 = load <2 x double>, <2 x double> addrspace(1)* bitcast (double addrspace(1)* getelementptr inbounds ([100 x double], [100 x double] addrspace(1)* @data, i64 0, i64 4) to <2 x double> addrspace(1)*), align 8 -define void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 { +define amdgpu_kernel void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 { entry: %tmp1 = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = zext i32 %tmp1 to i64 @@ -39,7 +39,7 @@ ; CHECK: %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)* ; CHECK: %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2 ; CHECK: %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4 -define void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 { +define amdgpu_kernel void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)* @@ -55,7 +55,7 @@ ; CHECK: br i1 ; CHECK: load float, float addrspace(4)* ; CHECK: br label -define void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 { +define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 { entry: %ptr = alloca float addrspace(4)*, align 8 %tmp = call i32 @llvm.amdgcn.workitem.id.x() @@ -85,7 +85,7 @@ ; CHECK-LABEL: @generic_address_opt_phi_bug9776_simple_phi_kernel( ; CHECK: phi i32 addrspace(3)* ; CHECK: store i32 %i.03, i32 addrspace(3)* % -define void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 { +define amdgpu_kernel void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 { entry: %cmp1 = icmp eq i32 %numElems, 0 br i1 %cmp1, label %for.end, label %for.body.lr.ph @@ -110,7 +110,7 @@ ; CHECK-LABEL: @generic_address_bug9899( ; CHECK: %vecload = load <2 x i32>, <2 x i32> addrspace(3)* ; CHECK: store <2 x i32> %tmp16, <2 x i32> addrspace(3)* -define void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 { +define amdgpu_kernel void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 { entry: %tmp1 = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = zext i32 %tmp1 to i64 Index: test/Transforms/InferAddressSpaces/AMDGPU/select.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/select.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/select.ll @@ -18,7 +18,7 @@ ; CHECK-LABEL: @store_select_group_flat( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1 ; CHECK: 
store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { +define amdgpu_kernel void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 @@ -43,7 +43,7 @@ ; CHECK: %2 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* %2 ; CHECK: store i32 -1, i32 addrspace(4)* %select -define void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 { +define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %cast1 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 @@ -73,7 +73,7 @@ ; CHECK-LABEL: @store_select_group_flat_null( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null store i32 -1, i32 addrspace(4)* %select @@ -83,7 +83,7 @@ ; CHECK-LABEL: @store_select_group_flat_null_swap( ; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0 ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* null, i32 addrspace(4)* %cast0 store i32 -1, i32 addrspace(4)* %select @@ -93,7 +93,7 @@ ; CHECK-LABEL: @store_select_group_flat_undef( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* undef ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* undef store i32 -1, i32 addrspace(4)* %select @@ -103,7 +103,7 @@ ; CHECK-LABEL: @store_select_group_flat_undef_swap( ; CHECK: %select = select i1 %c, i32 addrspace(3)* undef, i32 addrspace(3)* %group.ptr.0 ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 
%c, i32 addrspace(4)* undef, i32 addrspace(4)* %cast0 store i32 -1, i32 addrspace(4)* %select @@ -114,7 +114,7 @@ ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) ; CHECK: %gep = getelementptr i32, i32 addrspace(3)* %select, i64 16 ; CHECK: store i32 -1, i32 addrspace(3)* %gep -define void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null %gep = getelementptr i32, i32 addrspace(4)* %select, i64 16 @@ -127,7 +127,7 @@ ; CHECK-LABEL: @store_select_group_flat_constexpr( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* @lds1 ; CHECK: store i32 7, i32 addrspace(3)* %select -define void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -137,7 +137,7 @@ ; CHECK-LABEL: @store_select_group_flat_inttoptr_flat( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) to i32 addrspace(3)*) ; CHECK: store i32 7, i32 addrspace(3)* %select -define void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -147,7 +147,7 @@ ; CHECK-LABEL: @store_select_group_flat_inttoptr_group( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) ; CHECK-NEXT: store i32 7, i32 addrspace(3)* %select -define void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -158,7 +158,7 @@ ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* 
addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -169,7 +169,7 @@ ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %1 ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %cast0 store i32 7, i32 addrspace(4)* %select @@ -179,7 +179,7 @@ ; CHECK-LABEL: @store_select_group_global_mismatch_null_null( ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_global_mismatch_null_null(i1 %c) #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_null_null(i1 %c) #0 { %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select ret void @@ -187,42 +187,42 @@ ; CHECK-LABEL: @store_select_group_global_mismatch_null_null_constexpr( ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 -define void @store_select_group_global_mismatch_null_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_null_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_gv_null_constexpr( ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 -define void @store_select_group_global_mismatch_gv_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_gv_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_null_gv_constexpr( ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), 
align 4 -define void @store_select_group_global_mismatch_null_gv_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_null_gv_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_null_constexpr( ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 -define void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_flat_null_constexpr( ; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4 -define void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr( ; CHECK: store i32 7, i32 addrspace(3)* null -define void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* undef to i32 addrspace(4)*)), align 4 ret void } @@ -233,7 +233,7 @@ ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*) ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 
124) to i32 addrspace(1)*) to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -248,7 +248,7 @@ ; CHECK: %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1 ; CHECK: store i32 -1, i32 addrspace(4)* %extract0 ; CHECK: store i32 -2, i32 addrspace(4)* %extract1 -define void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 { +define amdgpu_kernel void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 { %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*> %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*> %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1 Index: test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll =================================================================== --- test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll +++ test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @volatile_load_flat_from_global( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32 addrspace(1)* -define void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -16,7 +16,7 @@ ; CHECK-LABEL: @volatile_load_flat_from_constant( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32 addrspace(1)* -define void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(2)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -27,7 +27,7 @@ ; CHECK-LABEL: @volatile_load_flat_from_group( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32 addrspace(3)* -define void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -38,7 +38,7 @@ ; CHECK-LABEL: @volatile_load_flat_from_private( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32* -define void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 { %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -49,7 +49,7 @@ ; CHECK-LABEL: @volatile_store_flat_to_global( ; CHECK: load i32, i32 addrspace(1)* ; CHECK: store volatile 
i32 %val, i32 addrspace(4)* -define void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -60,7 +60,7 @@ ; CHECK-LABEL: @volatile_store_flat_to_group( ; CHECK: load i32, i32 addrspace(3)* ; CHECK: store volatile i32 %val, i32 addrspace(4)* -define void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -71,7 +71,7 @@ ; CHECK-LABEL: @volatile_store_flat_to_private( ; CHECK: load i32, i32* ; CHECK: store volatile i32 %val, i32 addrspace(4)* -define void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 { +define amdgpu_kernel void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 { %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -119,7 +119,7 @@ ; CHECK-LABEL: @volatile_memset_group_to_flat( ; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) -define void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { +define amdgpu_kernel void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) ret void @@ -128,7 +128,7 @@ ; CHECK-LABEL: @volatile_memset_global_to_flat( ; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) -define void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { +define amdgpu_kernel void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) ret void Index: test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll @@ -15,7 +15,7 @@ ; NOSCOPE: load float ; NOSCOPE: store float ; NOSCOPE: store float -define void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1 store float 0.0, float addrspace(1)* %a, align 4, !noalias !0 
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll @@ -10,7 +10,7 @@ ; ALIGNED: load i8, i8* %ptr0, align 1{{$}} ; ALIGNED: load i8, i8* %ptr1, align 1{{$}} -define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i8], align 1 %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset %val0 = load i8, i8* %ptr0, align 1 @@ -27,7 +27,7 @@ ; ALIGNED: load i16, i16* %ptr0, align 1{{$}} ; ALIGNED: load i16, i16* %ptr1, align 1{{$}} -define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i16], align 1 %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset %val0 = load i16, i16* %ptr0, align 1 @@ -47,7 +47,7 @@ ; ALIGNED: load i32, i32* %ptr0, align 1 ; ALIGNED: load i32, i32* %ptr1, align 1 -define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 1 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset %val0 = load i32, i32* %ptr0, align 1 @@ -68,7 +68,7 @@ ; FIXME: Should change alignment ; ALIGNED: load i32 ; ALIGNED: load i32 -define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 16 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset %val0 = load i32, i32* %ptr0, align 1 @@ -85,7 +85,7 @@ ; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}} ; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}} -define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i8], align 1 %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset store i8 9, i8* %ptr0, align 1 @@ -100,7 +100,7 @@ ; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}} ; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}} -define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i16], align 1 %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset store i16 9, i16* %ptr0, align 1 @@ -119,7 +119,7 @@ ; ALIGNED: store i32 9, i32* %ptr0, align 1 ; ALIGNED: store i32 10, i32* %ptr1, align 1 -define void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 1 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset 
store i32 9, i32* %ptr0, align 1 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll @@ -8,7 +8,7 @@ ; CHECK: sext i32 %id.x to i64 ; CHECK: load <2 x float> ; CHECK: store <2 x float> zeroinitializer -define void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %sext.id.x = sext i32 %id.x to i64 @@ -32,7 +32,7 @@ ; CHECK: zext i32 %id.x to i64 ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %zext.id.x = zext i32 %id.x to i64 @@ -54,7 +54,7 @@ ; CHECK-LABEL: @merge_op_zext_index( ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %id.x, 2 @@ -81,7 +81,7 @@ ; CHECK-LABEL: @merge_op_sext_index( ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %id.x, 2 @@ -112,7 +112,7 @@ ; CHECK: loop: ; CHECK: load <2 x i32> ; CHECK: store <2 x i32> -define void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 { +define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 { entry: %cmp0 = icmp eq i32 %n, 0 br i1 %cmp0, label %exit, label %loop Index: test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -11,7 +11,7 @@ ; CHECK: load <2 x float> ; CHECK: %w = add i32 %y, 9 ; CHECK: %foo = add i32 %z, %w -define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { +define amdgpu_kernel 
void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx @@ -38,7 +38,7 @@ ; CHECK: %w = add i32 %y, 9 ; CHECK: store <2 x float> ; CHECK: %foo = add i32 %z, %w -define void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx Index: test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll @@ -8,7 +8,7 @@ ; CHECK: store double 0.000000e+00, double addrspace(1)* %a, ; CHECK: load double ; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1 -define void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { entry: %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -17,7 +17,7 @@ ; ELT8-UNALIGNED: store <2 x i32> ; ELT16-UNALIGNED: store <4 x i32> -define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 %out.gep.3 = getelementptr i32, i32* %out, i32 3 @@ -44,7 +44,7 @@ ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 -define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 %out.gep.3 = getelementptr i32, i32* %out, i32 3 @@ -71,7 +71,7 @@ ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 -define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 %out.gep.3 = getelementptr i32, i32* %out, i32 3 @@ -85,7 +85,7 @@ ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( ; ALL: store <4 x i8> -define void 
@merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { %out.gep.1 = getelementptr i8, i8* %out, i32 1 %out.gep.2 = getelementptr i8, i8* %out, i32 2 %out.gep.3 = getelementptr i8, i8* %out, i32 3 @@ -104,7 +104,7 @@ ; ALIGNED: store i8 ; UNALIGNED: store <4 x i8> , <4 x i8>* %1, align 1 -define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 { %out.gep.1 = getelementptr i8, i8* %out, i32 1 %out.gep.2 = getelementptr i8, i8* %out, i32 2 %out.gep.3 = getelementptr i8, i8* %out, i32 3 @@ -118,7 +118,7 @@ ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16( ; ALL: store <2 x i16> -define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 4 @@ -131,7 +131,7 @@ ; ALIGNED: store i16 ; UNALIGNED: store <2 x i16> , <2 x i16>* %1, align 2 -define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 2 @@ -144,7 +144,7 @@ ; ALIGNED: store i16 ; UNALIGNED: store <2 x i16> , <2 x i16>* %1, align 1 -define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 1 @@ -154,7 +154,7 @@ ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8( ; ALL: store <2 x i16> , <2 x i16>* %1, align 8 -define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 8 @@ -179,7 +179,7 @@ ; ELT16-ALIGNED: store i32 ; ELT16-UNALIGNED: store <3 x i32> -define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 @@ -202,7 +202,7 @@ ; ELT8-UNALIGNED: store i32 ; ELT16-UNALIGNED: store <3 x i32> -define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 @@ -218,7 +218,7 @@ ; ALIGNED: store i8 ; UNALIGNED: store <3 x i8> -define void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 { +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 { %out.gep.1 = getelementptr i8, i8* %out, i8 1 %out.gep.2 = getelementptr i8, i8* %out, i8 2 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_i8( ; CHECK: store <2 x i8> 
, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2 -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -20,7 +20,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align ; CHECK: store <2 x i8> -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -30,7 +30,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_i16 ; CHECK: store <2 x i16> , <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -40,7 +40,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_0_i16 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 0, i16 addrspace(1)* %out.gep.1 @@ -50,7 +50,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align ; CHECK: store <2 x i16> -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -60,7 +60,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align ; CHECK: store <2 x half> -define void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 { %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 store half 2.0, half addrspace(1)* %out.gep.1 @@ -70,7 +70,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_i32 ; CHECK: store <2 x i32> , <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -80,7 +80,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32 ; CHECK: store <2 x i32> , <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* store float 1.0, float addrspace(1)* %out.gep.1.bc @@ -90,7 +90,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32 ; CHECK store <2 x float> , <2 x float> addrspace(1)* %{{[0-9]+$}} -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 
{ +define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* store i32 123, i32 addrspace(1)* %out.gep.1.bc @@ -100,7 +100,7 @@ ; CHECK-LABEL: @merge_global_store_4_constants_i32 ; CHECK: store <4 x i32> , <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -114,7 +114,7 @@ ; CHECK-LABEL: @merge_global_store_4_constants_f32_order ; CHECK: store <4 x float> , <4 x float> addrspace(1)* %{{[0-9]+}} -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -129,7 +129,7 @@ ; First store is out of order. ; CHECK-LABEL: @merge_global_store_4_constants_f32 ; CHECK: store <4 x float> , <4 x float> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -143,7 +143,7 @@ ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32 ; CHECK: store <4 x i32> , <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -160,7 +160,7 @@ ; CHECK-LABEL: @merge_global_store_3_constants_i32 ; CHECK: store <3 x i32> , <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -172,7 +172,7 @@ ; CHECK-LABEL: @merge_global_store_2_constants_i64 ; CHECK: store <2 x i64> , <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 store i64 123, i64 addrspace(1)* %out.gep.1 @@ -183,7 +183,7 @@ ; CHECK-LABEL: @merge_global_store_4_constants_i64 ; CHECK: store <2 x i64> , <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 ; CHECK: store <2 x i64> , <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) 
#0 { +define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 @@ -202,7 +202,7 @@ ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1 ; CHECK: store <2 x i32> [[INSERT1]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -220,7 +220,7 @@ ; CHECK: insertelement ; CHECK: insertelement ; CHECK: store <2 x i32> -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 @@ -241,7 +241,7 @@ ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1 ; CHECK: store <2 x i32> [[INSERT1]] -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -256,7 +256,7 @@ ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -279,7 +279,7 @@ ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32 ; CHECK: load <3 x i32> ; CHECK: store <3 x i32> -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -298,7 +298,7 @@ ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32 ; CHECK: load <4 x float> ; CHECK: store <4 x float> -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -321,7 
+321,7 @@ ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 @@ -346,7 +346,7 @@ ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -373,7 +373,7 @@ ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -408,7 +408,7 @@ ; CHECK: insertelement <4 x i8> ; CHECK: insertelement <4 x i8> ; CHECK: store <4 x i8> -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -431,7 +431,7 @@ ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align ; CHECK: load <4 x i8> ; CHECK: store <4 x i8> -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -454,7 +454,7 @@ ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -474,7 +474,7 @@ ; CHECK-LABEL: @merge_local_store_2_constants_i8 ; CHECK: store <2 x i8> , <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2 -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { +define amdgpu_kernel 
void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 store i8 123, i8 addrspace(3)* %out.gep.1 @@ -484,7 +484,7 @@ ; CHECK-LABEL: @merge_local_store_2_constants_i32 ; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4 -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1 @@ -495,7 +495,7 @@ ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2 ; CHECK: store i32 ; CHECK: store i32 -define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1, align 2 @@ -506,7 +506,7 @@ ; CHECK-LABEL: @merge_local_store_4_constants_i32 ; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* ; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 @@ -521,7 +521,7 @@ ; CHECK-LABEL: @merge_global_store_5_constants_i32 ; CHECK: store <4 x i32> , <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store i32 -define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 12, i32 addrspace(1)* %idx1, align 4 @@ -537,7 +537,7 @@ ; CHECK-LABEL: @merge_global_store_6_constants_i32 ; CHECK: store <4 x i32> , <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store <2 x i32> , <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { store i32 13, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 15, i32 addrspace(1)* %idx1, align 4 @@ -555,7 +555,7 @@ ; CHECK-LABEL: @merge_global_store_7_constants_i32 ; CHECK: store <4 x i32> , <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store <3 x i32> , <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -575,7 +575,7 @@ ; CHECK-LABEL: @merge_global_store_8_constants_i32 ; CHECK: store <4 x i32> , <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store <4 x i32> , <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -597,7 +597,7 @@ ; CHECK-LABEL: @copy_v3i32_align4 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out -define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 store <3 x i32> %vec, <3 x i32> addrspace(1)* %out ret void @@ -606,7 +606,7 @@ ; CHECK-LABEL: @copy_v3i64_align4 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out -define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 store <3 x i64> %vec, <3 x i64> addrspace(1)* %out ret void @@ -615,7 +615,7 @@ ; CHECK-LABEL: @copy_v3f32_align4 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 ; CHECK: store <3 x float> -define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 %fadd = fadd <3 x float> %vec, store <3 x float> %fadd, <3 x float> addrspace(1)* %out @@ -625,7 +625,7 @@ ; CHECK-LABEL: @copy_v3f64_align4 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out -define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 %fadd = fadd <3 x double> %vec, store <3 x double> %fadd, <3 x double> addrspace(1)* %out Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @merge_v2i32_v2i32( ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> zeroinitializer -define void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1 @@ -22,7 +22,7 @@ ; CHECK-LABEL: @merge_v1i32_v1i32( ; CHECK: load <2 x i32> ; CHECK: store <2 x i32> zeroinitializer -define void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, 
i64 1 %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1 @@ -41,7 +41,7 @@ ; CHECK: load <3 x i32> ; CHECK: store <3 x i32> zeroinitializer ; CHECK: store <3 x i32> zeroinitializer -define void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1 @@ -58,7 +58,7 @@ ; CHECK-LABEL: @merge_v2i16_v2i16( ; CHECK: load <4 x i16> ; CHECK: store <4 x i16> zeroinitializer -define void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1 @@ -76,7 +76,7 @@ ; CHECK-LABEL: @merge_load_i32_v2i16( ; CHECK: load i32, ; CHECK: load <2 x i16> -define void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1 %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)* Index: test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: @load_keep_base_alignment_missing_align( ; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 -define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { +define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 %val0 = load float, float addrspace(3)* %ptr0 @@ -21,7 +21,7 @@ ; CHECK-LABEL: @store_keep_base_alignment_missing_align( ; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 -define void @store_keep_base_alignment_missing_align() { +define amdgpu_kernel void @store_keep_base_alignment_missing_align() { %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 store float 0.0, float addrspace(3)* %arrayidx0 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -11,7 +11,7 @@ ; CHECK: store i32 0 ; CHECK: store i32 0 -define void @no_crash(i32 %arg) { +define amdgpu_kernel void @no_crash(i32 %arg) { %tmp2 = add i32 %arg, 14 %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 %tmp4 = add i32 %arg, 15 @@ -37,7 +37,7 @@ ; CHECK: load i32 ; CHECK: load i32 -define void @interleave_get_longest(i32 %arg) { +define amdgpu_kernel void @interleave_get_longest(i32 %arg) { %a1 = add 
i32 %arg, 1 %a2 = add i32 %arg, 2 %a3 = add i32 %arg, 3 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll @@ -5,7 +5,7 @@ ; CHECK: store i32 ; CHECK: store i32 ; CHECK: store i32 -define void @no_implicit_float(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @optnone( ; CHECK: store i32 ; CHECK: store i32 -define void @optnone(i32 addrspace(1)* %out) noinline optnone { +define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -13,7 +13,7 @@ ; CHECK-LABEL: @do_opt( ; CHECK: store <2 x i32> -define void @do_opt(i32 addrspace(1)* %out) { +define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -9,7 +9,7 @@ ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* ; CHECK: store <2 x i64> zeroinitializer -define void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1 @@ -28,7 +28,7 @@ ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* ; CHECK: store <2 x i32> zeroinitializer -define void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1 %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1 @@ -46,7 +46,7 @@ ; CHECK: load <2 x i64> ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 
addrspace(1)* addrspace(1)* @@ -61,7 +61,7 @@ ; CHECK: load <2 x i64> ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 ; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)* -define void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -76,7 +76,7 @@ ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64 ; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0 ; CHECK: store <2 x i64> -define void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { +define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -92,7 +92,7 @@ ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)* @@ -107,7 +107,7 @@ ; CHECK: load <2 x i32> ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)* -define void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)* @@ -122,7 +122,7 @@ ; CHECK: load <2 x i32> ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0 ; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)* -define void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 @@ -137,7 +137,7 @@ ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32 ; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 ; CHECK: store <2 x i32> -define void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { +define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 @@ -152,7 +152,7 @@ ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32 ; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1 ; CHECK: store <2 x i32> -define void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { entry: 
%a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1 %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)* @@ -166,7 +166,7 @@ ; CHECK-LABEL: @no_merge_store_ptr32_i64( ; CHECK: store i8 addrspace(3)* ; CHECK: store i64 -define void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { +define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -181,7 +181,7 @@ ; CHECK-LABEL: @no_merge_store_i64_ptr32( ; CHECK: store i64 ; CHECK: store i8 addrspace(3)* -define void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { +define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)* @@ -195,7 +195,7 @@ ; CHECK-LABEL: @no_merge_load_i64_ptr32( ; CHECK: load i64, ; CHECK: load i8 addrspace(3)*, -define void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)* @@ -209,7 +209,7 @@ ; CHECK-LABEL: @no_merge_load_ptr32_i64( ; CHECK: load i8 addrspace(3)*, ; CHECK: load i64, -define void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -226,7 +226,7 @@ ; CHECK: load <2 x i8 addrspace(1)*> ; CHECK: store <2 x i8 addrspace(1)*> ; CHECK: store <2 x i8 addrspace(1)*> -define void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { +define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1 @@ -245,7 +245,7 @@ ; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)* ; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: bitcast i64 [[ELT1_INT]] to double -define void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 @@ -262,7 +262,7 @@ ; CHECK: bitcast i64 [[ELT0]] to double ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void 
@merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* @@ -279,7 +279,7 @@ ; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { +define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { entry: %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 @@ -296,7 +296,7 @@ ; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)* Index: test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll @@ -9,7 +9,7 @@ ; CHECK: store <4 x float> ; Function Attrs: nounwind -define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { +define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { bb: %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)* %tmp1 = load float, float addrspace(1)* %tmp, align 4 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll @@ -16,7 +16,7 @@ ; CHECK-LABEL: @merge_store_2_constants_i1( ; CHECK: store i1 ; CHECK: store i1 -define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 store i1 true, i1 addrspace(1)* %out.gep.1 store i1 false, i1 addrspace(1)* %out @@ -26,7 +26,7 @@ ; CHECK-LABEL: @merge_store_2_constants_i2( ; CHECK: store i2 1 ; CHECK: store i2 -1 -define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 store i2 1, i2 addrspace(1)* %out.gep.1 store i2 -1, i2 addrspace(1)* %out @@ -36,7 +36,7 @@ ; CHECK-LABEL: @merge_different_store_sizes_i1_i8( ; CHECK: store i1 true ; CHECK: store i8 123 -define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i1 true, i1 
addrspace(1)* %out.i1 @@ -47,7 +47,7 @@ ; CHECK-LABEL: @merge_different_store_sizes_i8_i1( ; CHECK: store i8 123 ; CHECK: store i1 true -define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -58,7 +58,7 @@ ; CHECK-LABEL: @merge_store_2_constant_structs( ; CHECK: store %struct.foo ; CHECK: store %struct.foo -define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1 store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out @@ -69,7 +69,7 @@ ; CHECK-LABEL: @merge_store_2_constants_v2i2( ; CHECK: store <2 x i2> ; CHECK: store <2 x i2> -define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 store <2 x i2> , <2 x i2> addrspace(1)* %out.gep.1 store <2 x i2> , <2 x i2> addrspace(1)* %out @@ -81,7 +81,7 @@ ; CHECK-LABEL: @merge_store_2_constants_v4i2( ; CHECK: store <4 x i2> ; CHECK: store <4 x i2> -define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 store <4 x i2> , <4 x i2> addrspace(1)* %out.gep.1 store <4 x i2> , <4 x i2> addrspace(1)* %out @@ -91,7 +91,7 @@ ; CHECK-LABEL: @merge_load_2_constants_i1( ; CHECK: load i1 ; CHECK: load i1 -define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 %x = load i1, i1 addrspace(1)* %out.gep.1 %y = load i1, i1 addrspace(1)* %out @@ -103,7 +103,7 @@ ; CHECK-LABEL: @merge_load_2_constants_i2( ; CHECK: load i2 ; CHECK: load i2 -define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 %x = load i2, i2 addrspace(1)* %out.gep.1 %y = load i2, i2 addrspace(1)* %out @@ -115,7 +115,7 @@ ; CHECK-LABEL: @merge_different_load_sizes_i1_i8( ; CHECK: load i1 ; CHECK: load i8 -define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 %x = load i1, i1 addrspace(1)* %out.i1 @@ -128,7 +128,7 @@ ; CHECK-LABEL: @merge_different_load_sizes_i8_i1( ; CHECK: load i8 ; CHECK: load i1 -define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 %x = load i8, i8 addrspace(1)* %out.gep.1 @@ -141,7 +141,7 @@ ; CHECK-LABEL: 
@merge_load_2_constant_structs( ; CHECK: load %struct.foo ; CHECK: load %struct.foo -define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1 %y = load %struct.foo, %struct.foo addrspace(1)* %out @@ -153,7 +153,7 @@ ; CHECK-LABEL: @merge_load_2_constants_v2i2( ; CHECK: load <2 x i2> ; CHECK: load <2 x i2> -define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1 %y = load <2 x i2>, <2 x i2> addrspace(1)* %out @@ -165,7 +165,7 @@ ; CHECK-LABEL: @merge_load_2_constants_v4i2( ; CHECK: load <4 x i2> ; CHECK: load <4 x i2> -define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1 %y = load <4 x i2>, <4 x i2> addrspace(1)* %out @@ -177,7 +177,7 @@ ; CHECK-LABEL: @merge_store_2_constants_i9( ; CHECK: store i9 3 ; CHECK: store i9 -5 -define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1 store i9 3, i9 addrspace(1)* %out.gep.1 store i9 -5, i9 addrspace(1)* %out @@ -187,7 +187,7 @@ ; CHECK-LABEL: @merge_load_2_constants_v2i9( ; CHECK: load <2 x i9> ; CHECK: load <2 x i9> -define void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1 %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1 %y = load <2 x i9>, <2 x i9> addrspace(1)* %out Index: test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll =================================================================== --- test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll +++ test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll @@ -17,7 +17,7 @@ ; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst ; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst ; OPT: br i1 %exitcond -define void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -54,7 +54,7 @@ ; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383 ; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %scevgep4, i32 undef, i32 undef seq_cst monotonic -define void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { 
+define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge Index: test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll =================================================================== --- test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll +++ test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll @@ -10,7 +10,7 @@ ; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095 ; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1 -define void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -48,7 +48,7 @@ ; OPT: {{^}}.lr.ph: ; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1 -define void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -83,7 +83,7 @@ ; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535 ; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1 -define void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -122,7 +122,7 @@ ; OPT: {{^}}.lr.ph: ; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1 -define void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge Index: test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll =================================================================== --- 
test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
+++ test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
@@ -15,7 +15,7 @@
;CHECK: buffer_store_dword
;CHECK: s_branch [[LOOP_LABEL]]
-define void @foo() {
+define amdgpu_kernel void @foo() {
entry:
br label %loop
Index: test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
+++ test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
@@ -16,7 +16,7 @@
; CHECK: bb:
; CHECK: inttoptr i32 %lsr.iv.next2 to i8 addrspace(3)*
; CHECK: %c1 = icmp ne i8 addrspace(3)*
-define void @local_cmp_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind {
entry:
br label %bb11
@@ -47,7 +47,7 @@
; CHECK: bb:
; CHECK: inttoptr i64 %lsr.iv.next2 to i8 addrspace(1)*
; CHECK: icmp ne i8 addrspace(1)* %t
-define void @global_cmp_user(i64 %arg0) nounwind {
+define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind {
entry:
br label %bb11
@@ -78,7 +78,7 @@
; CHECK: bb:
; CHECK: %idxprom = sext i32 %lsr.iv1 to i64
; CHECK: getelementptr i8, i8 addrspace(1)* %t, i64 %idxprom
-define void @global_gep_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_gep_user(i32 %arg0) nounwind {
entry:
br label %bb11
@@ -108,7 +108,7 @@
; CHECK: bb
; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
-define void @global_sext_scale_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_sext_scale_user(i32 %arg0) nounwind {
entry:
br label %bb11
Index: test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
+++ test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
@@ -14,7 +14,7 @@
; CHECK: %scevgep = getelementptr i32, i32 addrspace(3)* %tmp1, i32 4
; CHECK:%tmp14 = load i32, i32 addrspace(3)* %scevgep
-define void @lsr_crash_preserve_addrspace_unknown_type() #0 {
+define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
bb:
br label %bb1
Index: test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
===================================================================
--- test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
+++ test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
@@ -6,7 +6,7 @@
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK-NOT: br
-define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
+define amdgpu_kernel void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
entry:
br label %for.body
Index: test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
===================================================================
--- test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
+++ test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
@@ -7,7 +7,7 @@
; CHECK: store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
; CHECK: ret void
-define void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
%arr = alloca [64 x i32], align 4
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -40,7 +40,7 @@
; CHECK: br i1 %[[exitcond]]
; CHECK-NOT: icmp eq i32 %{{.*}}, 100
-define void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
%arr = alloca [64 x i32], align 4
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -82,7 +82,7 @@
; CHECK: icmp eq i32 %{{.*}}, 100
; CHECK: br
-define void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
%arr = alloca [256 x i32], align 4
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -116,7 +116,7 @@
; CHECK: icmp eq i32 %{{.*}}, 100
; CHECK: br
-define void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
+define amdgpu_kernel void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
entry:
%arr = alloca i32, i32 %n, align 4
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
===================================================================
--- test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
+++ test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
@@ -3,14 +3,14 @@
; Check that loop unswitch happened and condition hoisted out of the loop.
; Condition is uniform so all targets should perform unswitching.
-; CHECK-LABEL: {{^}}define void @uniform_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
; CHECK: entry:
; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
; CHECK-NEXT: br i1
-define void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
+define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
@@ -42,14 +42,14 @@
; Check that loop unswitch does not happen if condition is divergent.
-; CHECK-LABEL: {{^}}define void @divergent_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @divergent_unswitch
; CHECK: entry:
; CHECK: icmp
; CHECK: [[IF_COND:%[a-z0-9]+]] = icmp {{.*}} 567890
; CHECK: br label
; CHECK: br i1 [[IF_COND]]
-define void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
+define amdgpu_kernel void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
entry:
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup
Index: test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
===================================================================
--- test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
+++ test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
@@ -7,7 +7,7 @@
; CHECK: store i32
; CHECK-NOT: store i32
; CHECK: ret
-define void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
+define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
entry:
%0 = icmp sgt i32 %size, 0
br i1 %0, label %loop, label %exit
Index: test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
===================================================================
--- test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
+++ test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
@@ -9,7 +9,7 @@
; Simple 3-pair chain with loads and stores
-define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
; CHECK-LABEL: @test1_as_3_3_3(
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
@@ -29,7 +29,7 @@
ret void
}
-define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
+define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
; CHECK-LABEL: @test1_as_3_0_0(
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
; CHECK: load <2 x double>, <2 x double>*
@@ -49,7 +49,7 @@
ret void
}
-define void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
; CHECK-LABEL: @test1_as_0_0_3(
; CHECK: load <2 x double>, <2 x double>*
; CHECK: load <2 x double>, <2 x double>*
Index: test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
===================================================================
--- test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -9,7 +9,7 @@
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
-define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
%tmp = sext i32 %y to i64
%tmp1 = sext i32 %x to i64
%tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
@@ -42,7 +42,7 @@
; IR: add i32 %x, 256
; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
%tmp = sext i32 %y to i64
%tmp1 = sext i32 %x to i64
%tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
@@ -74,7 +74,7 @@
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383
-define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
%tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
%tmp4 = load float, float addrspace(3)* %tmp2, align 4
%tmp5 = fadd float %tmp4, 0.000000e+00
Index: test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
===================================================================
--- test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
+++ test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -6,7 +6,7 @@
; CHECK-LABEL: @slsr_after_reassociate_global_geps_mubuf_max_offset(
; CHECK: [[b1:%[0-9]+]] = getelementptr float, float addrspace(1)* %arr, i64 [[bump:%[0-9]+]]
; CHECK: [[b2:%[0-9]+]] = getelementptr float, float addrspace(1)* [[b1]], i64 [[bump]]
-define void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
bb:
%i2 = shl nsw i32 %i, 1
%j1 = add nsw i32 %i, 1023
@@ -33,7 +33,7 @@
; CHECK: %tmp = sext i32 %j1 to i64
; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
-define void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
bb:
%i2 = shl nsw i32 %i, 1
%j1 = add nsw i32 %i, 1024
@@ -61,7 +61,7 @@
; CHECK: [[B2:%[0-9]+]] = getelementptr float, float addrspace(3)* [[B1]], i32 %i
; CHECK: getelementptr inbounds float, float addrspace(3)* [[B2]], i32 16383
-define void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
bb:
%i2 = shl nsw i32 %i, 1
%j1 = add nsw i32 %i, 16383
@@ -86,7 +86,7 @@
; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
; CHECK: %j2 = add i32 %j1, %i
; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
-define void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
bb:
%i2 = shl nsw i32 %i, 1
%j1 = add nsw i32 %i, 16384