diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -44,6 +44,8 @@ //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; DwarfRegNumForCFI = true; + if (TT.getArch() == Triple::amdgcn) + ExceptionsType = ExceptionHandling::DwarfCFI; } bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -254,7 +254,7 @@ ; GCN: buffer_store_dword [[BB4_K]] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .Lfunc_end{{[0-9]+}}: -define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) #0 { bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -297,7 +297,7 @@ ; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}} ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT .Lfunc_end{{[0-9]+}}: -define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) #0 { entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -65,7 +65,7 @@ define amdgpu_kernel void @fadd( float addrspace(1)* %r, float addrspace(1)* %a, - float addrspace(1)* %b) { + float addrspace(1)* %b) #0 { entry: %a.val = load float, float addrspace(1)* %a %b.val = load float, float addrspace(1)* %b @@ -77,7 +77,7 @@ define amdgpu_kernel void @fsub( float addrspace(1)* %r, float addrspace(1)* %a, - float addrspace(1)* %b) { + float addrspace(1)* %b) #0 { entry: %a.val = load float, float addrspace(1)* %a %b.val = load float, float addrspace(1)* %b @@ -85,3 +85,5 @@ store float %r.val, float addrspace(1)* %r ret void } + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll --- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -57,5 +57,5 @@ ; Function Attrs: convergent nounwind readnone declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4 -attributes #0 = { "no-signed-zeros-fp-math"="true" } +attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #4 = { convergent nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -20,7 +20,7 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) ret <4 x float> %data @@ -31,7 +31,7 @@ ;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0) ret <4 x float> %data @@ -40,7 +40,7 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data @@ -49,7 +49,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -58,7 +58,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) @@ -68,7 +68,7 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data @@ -78,7 +78,7 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -87,7 +87,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret float %data @@ -96,7 +96,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret <2 x float> %data @@ -105,7 +105,7 @@ ;CHECK-LABEL: {{^}}buffer_load_negative_offset: ;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen -define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) #0 { main_body: %ofs.1 = add i32 %ofs, -16 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0) @@ -117,7 +117,7 @@ ; CHECK-LABEL: buffer_load_mmo: ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) { +define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) #0 { entry: store float 0.0, float addrspace(3)* %lds %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) @@ -131,7 +131,7 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -156,7 +156,7 @@ ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -179,7 +179,7 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -197,7 +197,7 @@ ;CHECK-NEXT: %bb. ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -214,7 +214,7 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) #0 { main_body: %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) @@ -231,7 +231,7 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) #0 { main_body: %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) @@ -247,7 +247,7 @@ ;CHECK-NEXT: %bb. ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) #0 { main_body: %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) @@ -263,7 +263,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %val = uitofp i8 %tmp to float @@ -276,7 +276,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) %tmp2 = zext i16 %tmp to i32 @@ -290,7 +290,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i8 %tmp to i32 @@ -304,7 +304,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) %tmp2 = sext i16 %tmp to i32 @@ -317,7 +317,7 @@ ;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i8 %tmp to i32 @@ -330,7 +330,7 @@ ;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i16 %tmp to i32 @@ -343,7 +343,7 @@ ;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i8 %tmp to i32 @@ -356,7 +356,7 @@ ;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i16 %tmp to i32 @@ -370,7 +370,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i8 %tmp to i32 @@ -385,7 +385,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i16 %tmp to i32 @@ -400,7 +400,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i8 %tmp to i32 @@ -415,7 +415,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = sext i16 %tmp to i32 @@ -430,7 +430,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) %tmp2 = zext i8 %tmp to i32 @@ -446,7 +446,7 @@ ; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset: ; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen -define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) #0 { %alloca = alloca i32, addrspace(5) %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32 @@ -458,7 +458,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}} ; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s ; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]] -define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) { +define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) #0 { %alloca = alloca i32, addrspace(5) %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -17,7 +17,7 @@ ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret void @@ -26,7 +26,7 @@ ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) ret void @@ -35,7 +35,7 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) ret void @@ -44,7 +44,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) ret void @@ -54,7 +54,7 @@ ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) ret void @@ -69,7 +69,7 @@ ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) @@ -80,7 +80,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { main_body: call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void @@ -99,7 +99,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -120,7 +120,7 @@ ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -139,7 +139,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) @@ -150,7 +150,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) { +define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) #0 { %a1 = add i32 %a, 28 %a2 = add i32 %a, 32 %a3 = add i32 %a, 36 @@ -163,7 +163,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) { +define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) @@ -174,7 +174,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) @@ -186,7 +186,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) @@ -199,7 +199,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) #0 { call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) ret void @@ -208,7 +208,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) { +define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) #0 { call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) @@ -218,7 +218,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) #0 { call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) ret void @@ -227,7 +227,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8 -define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) { +define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) #0 { call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) ret void @@ -237,7 +237,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-NEXT: %bb. ;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8 -define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) #0 { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -249,7 +249,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-NEXT: %bb. ;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16 -define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) #0 { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}gs_const: ; GCN-NOT: v_cmpx ; GCN: s_mov_b64 exec, 0 -define amdgpu_gs void @gs_const() { +define amdgpu_gs void @gs_const() #0 { %tmp = icmp ule i32 0, 3 %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00 %c1 = fcmp oge float %tmp1, 0.0 @@ -23,7 +23,7 @@ ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} ; GFX10: v_cmpx_le_f32_e32 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) { +define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) #0 { %tmp0 = fcmp olt float %arg13, 0.000000e+00 %c1 = fcmp oge float %arg14, 0.0 call void @llvm.amdgcn.kill(i1 %c1) @@ -36,7 +36,7 @@ ; GCN-NEXT: %bb. ; GCN-NEXT: %bb. ; GCN-NEXT: s_endpgm -define amdgpu_gs void @true() { +define amdgpu_gs void @true() #0 { call void @llvm.amdgcn.kill(i1 true) ret void } @@ -44,7 +44,7 @@ ; GCN-LABEL: {{^}}false: ; GCN-NOT: v_cmpx ; GCN: s_mov_b64 exec, 0 -define amdgpu_gs void @false() { +define amdgpu_gs void @false() #0 { call void @llvm.amdgcn.kill(i1 false) ret void } @@ -54,7 +54,7 @@ ; GCN: v_cmp_lt_i32 ; GCN: s_or_b64 s[0:1] ; GCN: s_and_b64 exec, exec, s[0:1] -define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) #0 { %c1 = icmp slt i32 %a, %b %c2 = icmp slt i32 %c, %d %x = or i1 %c1, %c2 @@ -67,7 +67,7 @@ ; GCN: v_cmp_lt_i32 ; GCN: s_xor_b64 s[0:1] ; GCN: s_andn2_b64 exec, exec, s[0:1] -define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) #0 { %c1 = icmp slt i32 %a, %b %c2 = icmp slt i32 %c, %d %x = xor i1 %c1, %c2 @@ -79,7 +79,7 @@ ; GCN-LABEL: {{^}}oeq: ; GCN: v_cmpx_eq_f32 ; GCN-NOT: s_and -define amdgpu_gs void @oeq(float %a) { +define amdgpu_gs void @oeq(float %a) #0 { %c1 = fcmp oeq float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}ogt: ; GCN: v_cmpx_lt_f32 ; GCN-NOT: s_and -define amdgpu_gs void @ogt(float %a) { +define amdgpu_gs void @ogt(float %a) #0 { %c1 = fcmp ogt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -97,7 +97,7 @@ ; GCN-LABEL: {{^}}oge: ; GCN: v_cmpx_le_f32 ; GCN-NOT: s_and -define amdgpu_gs void @oge(float %a) { +define amdgpu_gs void @oge(float %a) #0 { %c1 = fcmp oge float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -106,7 +106,7 @@ ; GCN-LABEL: {{^}}olt: ; GCN: v_cmpx_gt_f32 ; GCN-NOT: s_and -define amdgpu_gs void @olt(float %a) { +define amdgpu_gs void @olt(float %a) #0 { %c1 = fcmp olt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -115,7 +115,7 @@ ; GCN-LABEL: {{^}}ole: ; GCN: v_cmpx_ge_f32 ; GCN-NOT: s_and -define amdgpu_gs void @ole(float %a) { +define amdgpu_gs void @ole(float %a) #0 { %c1 = fcmp ole float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -124,7 +124,7 @@ ; GCN-LABEL: {{^}}one: ; GCN: v_cmpx_lg_f32 ; GCN-NOT: s_and -define amdgpu_gs void @one(float %a) { +define amdgpu_gs void @one(float %a) #0 { %c1 = fcmp one float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -133,7 +133,7 @@ ; GCN-LABEL: {{^}}ord: ; FIXME: This is absolutely unimportant, but we could use the cmpx variant here. ; GCN: v_cmp_o_f32 -define amdgpu_gs void @ord(float %a) { +define amdgpu_gs void @ord(float %a) #0 { %c1 = fcmp ord float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -142,7 +142,7 @@ ; GCN-LABEL: {{^}}uno: ; FIXME: This is absolutely unimportant, but we could use the cmpx variant here. ; GCN: v_cmp_u_f32 -define amdgpu_gs void @uno(float %a) { +define amdgpu_gs void @uno(float %a) #0 { %c1 = fcmp uno float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -151,7 +151,7 @@ ; GCN-LABEL: {{^}}ueq: ; GCN: v_cmpx_nlg_f32 ; GCN-NOT: s_and -define amdgpu_gs void @ueq(float %a) { +define amdgpu_gs void @ueq(float %a) #0 { %c1 = fcmp ueq float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -160,7 +160,7 @@ ; GCN-LABEL: {{^}}ugt: ; GCN: v_cmpx_nge_f32 ; GCN-NOT: s_and -define amdgpu_gs void @ugt(float %a) { +define amdgpu_gs void @ugt(float %a) #0 { %c1 = fcmp ugt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -170,7 +170,7 @@ ; SI: v_cmpx_ngt_f32_e32 vcc, -1.0 ; GFX10: v_cmpx_ngt_f32_e32 -1.0 ; GCN-NOT: s_and -define amdgpu_gs void @uge(float %a) { +define amdgpu_gs void @uge(float %a) #0 { %c1 = fcmp uge float %a, -1.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -180,7 +180,7 @@ ; SI: v_cmpx_nle_f32_e32 vcc, -2.0 ; GFX10: v_cmpx_nle_f32_e32 -2.0 ; GCN-NOT: s_and -define amdgpu_gs void @ult(float %a) { +define amdgpu_gs void @ult(float %a) #0 { %c1 = fcmp ult float %a, -2.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -190,7 +190,7 @@ ; SI: v_cmpx_nlt_f32_e32 vcc, 2.0 ; GFX10: v_cmpx_nlt_f32_e32 2.0 ; GCN-NOT: s_and -define amdgpu_gs void @ule(float %a) { +define amdgpu_gs void @ule(float %a) #0 { %c1 = fcmp ule float %a, 2.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -200,7 +200,7 @@ ; SI: v_cmpx_neq_f32_e32 vcc, 0 ; GFX10: v_cmpx_neq_f32_e32 0 ; GCN-NOT: s_and -define amdgpu_gs void @une(float %a) { +define amdgpu_gs void @une(float %a) #0 { %c1 = fcmp une float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) ret void @@ -210,7 +210,7 @@ ; SI: v_cmpx_ngt_f32_e32 vcc, 1.0 ; GFX10: v_cmpx_ngt_f32_e32 1.0 ; GCN-NOT: s_and -define amdgpu_gs void @neg_olt(float %a) { +define amdgpu_gs void @neg_olt(float %a) #0 { %c1 = fcmp olt float %a, 1.0 %c2 = xor i1 %c1, 1 call void @llvm.amdgcn.kill(i1 %c2) @@ -235,7 +235,7 @@ ; GCN: v_cmp_neq_f32_e32 vcc, 0 ; GCN: s_wqm_b64 s[0:1], vcc ; GCN: s_and_b64 exec, exec, s[0:1] -define amdgpu_ps void @wqm(float %a) { +define amdgpu_ps void @wqm(float %a) #0 { %c1 = fcmp une float %a, 0.0 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) call void @llvm.amdgcn.kill(i1 %c2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -7,7 +7,7 @@ ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc{{$}} ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc{{$}} ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1) @@ -26,7 +26,7 @@ ;GFX10: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc{{$}} ;GFX10: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc dlc{{$}} ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4) %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 5) @@ -40,7 +40,7 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0) ret <4 x float> %data @@ -50,7 +50,7 @@ ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0) ret <4 x float> %data @@ -59,7 +59,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -68,7 +68,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0) @@ -78,7 +78,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) { +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) #0 { main_body: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) ret float %data @@ -87,7 +87,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) { +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) #0 { main_body: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) ret <2 x float> %data @@ -97,7 +97,7 @@ ;PREGFX10: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0 ;GFX10: v_add_nc_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], -16, v0 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen -define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) #0 { main_body: %ofs.1 = add i32 %ofs, -16 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs.1, i32 0, i32 0) @@ -109,7 +109,7 @@ ; CHECK-LABEL: buffer_load_mmo: ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) { +define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) #0 { entry: store float 0.0, float addrspace(3)* %lds %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) @@ -123,7 +123,7 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -148,7 +148,7 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { +define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) #0 { main_body: %a = shl i32 %inp, 6 %a1 = or i32 %a, 4 @@ -174,7 +174,7 @@ ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -197,7 +197,7 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) { +define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -216,7 +216,7 @@ ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 4, v0 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { +define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) #0 { main_body: %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 @@ -236,7 +236,7 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) #0 { main_body: %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) @@ -253,7 +253,7 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) #0 { main_body: %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -270,7 +270,7 @@ ;CHECK: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc ;CHECK: buffer_load_dword v6, off, s[0:3], 0 slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) #0 { main_body: %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0) %data_glc = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 1) @@ -290,7 +290,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %tmp2 = zext i8 %tmp to i32 @@ -304,7 +304,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %tmp2 = zext i16 %tmp to i32 @@ -318,7 +318,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %tmp2 = sext i8 %tmp to i32 @@ -332,7 +332,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) { +define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) #0 { main_body: %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %tmp2 = sext i16 %tmp to i32 @@ -345,7 +345,7 @@ ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], off, s[0:3], 0 ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b16 v0, [[VAL]] -define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) { +define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) #0 { main_body: %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) store half %val, half addrspace(3)* %ptr @@ -357,7 +357,7 @@ ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0 ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b32 v0, [[VAL]] -define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) { +define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) #0 { main_body: %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) store <2 x half> %val, <2 x half> addrspace(3)* %ptr @@ -369,7 +369,7 @@ ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0 ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b64 v0, [[VAL]] -define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) { +define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) #0 { main_body: %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) store <4 x half> %val, <4 x half> addrspace(3)* %ptr @@ -381,7 +381,7 @@ ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0 ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b32 v0, [[VAL]] -define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) { +define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) #0 { main_body: %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr @@ -393,7 +393,7 @@ ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0 ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b64 v0, [[VAL]] -define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) { +define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) #0 { main_body: %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr @@ -405,7 +405,7 @@ ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 ;CHECK: s_waitcnt -define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) #0 { main_body: %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) @@ -427,7 +427,7 @@ ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 ;CHECK: s_waitcnt -define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) { +define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) #0 { main_body: %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8) %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -17,7 +17,7 @@ ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) ret void @@ -26,7 +26,7 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) ret void @@ -41,7 +41,7 @@ ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) @@ -52,7 +52,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -71,7 +71,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -91,7 +91,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -114,7 +114,7 @@ ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -133,7 +133,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) #0 { %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) @@ -144,7 +144,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) #0 { %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -157,7 +157,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -170,7 +170,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) #0 { call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) ret void @@ -181,7 +181,7 @@ ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc ;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -194,7 +194,7 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) #0 { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -207,7 +207,7 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) #0 { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -220,7 +220,7 @@ ;CHECK-NOT: v0 ;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) { +define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) #0 { main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half @@ -231,7 +231,7 @@ ;CHECK-LABEL: {{^}}buffer_store_v2f16: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) { +define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -251,7 +251,7 @@ ;CHECK-NOT: v0 ;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) { +define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) #0 { main_body: %trunc = trunc i32 %v1 to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0) @@ -261,7 +261,7 @@ ;CHECK-LABEL: {{^}}buffer_store_v2i16: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) { +define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) #0 { main_body: call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -280,7 +280,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -297,7 +297,7 @@ ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 -define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 { call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -7,7 +7,7 @@ ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -23,7 +23,7 @@ ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -43,7 +43,7 @@ ;CHECK: buffer_store_dword ;CHECK-NOT; s_wqm_b64 exec, exec ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -65,7 +65,7 @@ ;CHECK: buffer_store_dword ;CHECK; s_wqm_b64 exec, exec ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -87,7 +87,7 @@ ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK-NOT: s_wqm_b64 -define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -109,7 +109,7 @@ ;CHECK: %IF ;CHECK: buffer_load ;CHECK: buffer_load -define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) #1 { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -145,7 +145,7 @@ ;CHECK-NOT: s_and_b64 exec ;CHECK: buffer_load ;CHECK: buffer_load -define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) #1 { main_body: %c.bc = bitcast i32 %c to float %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -20,7 +20,7 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) ret <4 x float> %data @@ -30,7 +30,7 @@ ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0) ret <4 x float> %data @@ -39,7 +39,7 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data @@ -48,7 +48,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -57,7 +57,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0) @@ -67,7 +67,7 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data @@ -77,7 +77,7 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -86,7 +86,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret float %data @@ -95,7 +95,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret <2 x float> %data @@ -104,7 +104,7 @@ ;CHECK-LABEL: {{^}}buffer_load_negative_offset: ;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0 ;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen -define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) #0 { main_body: %ofs.1 = add i32 %ofs, -16 %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0) @@ -116,7 +116,7 @@ ; CHECK-LABEL: buffer_load_mmo: ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) { +define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) #0 { entry: store float 0.0, float addrspace(3)* %lds %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -130,7 +130,7 @@ ;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc ;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { +define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) #0 { main_body: %data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -150,7 +150,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i8 %tmp to i32 @@ -164,7 +164,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i16 %tmp to i32 @@ -178,7 +178,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i8 %tmp to i32 @@ -192,7 +192,7 @@ ;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 ;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i16 %tmp to i32 @@ -205,7 +205,7 @@ ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b16 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) #0 { main_body: %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) store half %val, half addrspace(3)* %ptr @@ -217,7 +217,7 @@ ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b32 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) #0 { main_body: %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x half> %val, <2 x half> addrspace(3)* %ptr @@ -229,7 +229,7 @@ ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b64 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) #0 { main_body: %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x half> %val, <4 x half> addrspace(3)* %ptr @@ -241,7 +241,7 @@ ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b16 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) #0 { main_body: %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) store i16 %val, i16 addrspace(3)* %ptr @@ -253,7 +253,7 @@ ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b32 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) #0 { main_body: %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr @@ -265,7 +265,7 @@ ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: ds_write_b64 v0, [[VAL]] -define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) { +define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) #0 { main_body: %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -17,7 +17,7 @@ ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) ret void @@ -26,7 +26,7 @@ ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) ret void @@ -35,7 +35,7 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) ret void @@ -44,7 +44,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) ret void @@ -54,7 +54,7 @@ ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) ret void @@ -69,7 +69,7 @@ ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) @@ -80,7 +80,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void @@ -100,7 +100,7 @@ ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc -define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) #0 { main_body: call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -113,7 +113,7 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) #0 { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -126,7 +126,7 @@ ;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) #0 { %v2 = fptrunc float %v1 to half call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void @@ -135,7 +135,7 @@ ;CHECK-LABEL: {{^}}struct_buffer_store_v2f16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) #0 { call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -143,7 +143,7 @@ ;CHECK-LABEL: {{^}}struct_buffer_store_v4f16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) #0 { call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -153,7 +153,7 @@ ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm -define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) #0 { main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -164,7 +164,7 @@ ;CHECK-LABEL: {{^}}struct_buffer_store_vif16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) #0 { call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -172,7 +172,7 @@ ;CHECK-LABEL: {{^}}struct_buffer_store_v4i16: ;CHECK-NEXT: %bb. ;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen -define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) { +define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) #0 { call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll --- a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll +++ b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll @@ -6,6 +6,8 @@ ; GCN: .Lfunc_begin0: ; GCN-NEXT: .file 0 ; GCN-NEXT: .loc 0 3 0 ; /tmp/dbg.cl:3:0 +; GCN-NEXT: .cfi_sections .debug_frame +; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp0: @@ -24,6 +26,7 @@ ; GCN-LABEL: split_v4f32_multi_arg: ; GCN: .Lfunc_begin1: ; GCN-NEXT: .loc 0 7 0 ; /tmp/dbg.cl:7:0 +; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp2: @@ -56,6 +59,7 @@ ; GCN-LABEL: split_v4f16_arg: ; GCN: .Lfunc_begin2: ; GCN-NEXT: .loc 0 11 0 is_stmt 1 ; /tmp/dbg.cl:11:0 +; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp8: @@ -72,6 +76,7 @@ ; GCN-LABEL: split_f64_arg: ; GCN: .Lfunc_begin3: ; GCN-NEXT: .loc 0 15 0 ; /tmp/dbg.cl:15:0 +; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp10: @@ -88,6 +93,7 @@ ; GCN-LABEL: split_v2f64_arg: ; GCN: .Lfunc_begin4: ; GCN-NEXT: .loc 0 19 0 ; /tmp/dbg.cl:19:0 +; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp12: @@ -106,6 +112,7 @@ ; GCN-LABEL: split_i64_arg: ; GCN: .Lfunc_begin5: ; GCN-NEXT: .loc 0 23 0 ; /tmp/dbg.cl:23:0 +; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp14: @@ -122,6 +129,7 @@ ; GCN-LABEL: split_ptr_arg: ; GCN: .Lfunc_begin6: ; GCN-NEXT: .loc 0 27 0 ; /tmp/dbg.cl:27:0 +; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: .Ltmp16: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -5,7 +5,7 @@ ; ;CHECK-LABEL: {{^}}test1: ;CHECK-NOT: s_wqm -define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) { +define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) #1 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0) call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0) @@ -47,7 +47,7 @@ ;CHECK: store ;CHECK-NOT: exec ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) #1 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex.1 = bitcast <4 x float> %tex to <4 x i32> @@ -98,7 +98,7 @@ ;CHECK: s_wqm_b64 exec, exec ;CHECK: image_sample ;CHECK: image_sample -define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) { +define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #1 { main_body: %c.1 = mul i32 %c, %d @@ -120,7 +120,7 @@ ; WQM was inserting an unecessary v_mov to self after the v_add. Make sure this ; does not happen - the v_add should write the return reg directly. ;CHECK-NOT: v_mov_b32_e32 -define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -136,7 +136,7 @@ ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -154,7 +154,7 @@ ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -170,7 +170,7 @@ ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_{{[iu]}}32_e32 -define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -192,7 +192,7 @@ ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test_wwm3(i32 inreg %idx) { +define amdgpu_ps float @test_wwm3(i32 inreg %idx) #1 { main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -221,7 +221,7 @@ ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK-NEXT: v_mov_b32_e32 -define amdgpu_ps float @test_wwm4(i32 inreg %idx) { +define amdgpu_ps float @test_wwm4(i32 inreg %idx) #1 { main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -250,7 +250,7 @@ ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: s_wqm_b64 exec, exec -define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -276,7 +276,7 @@ ;VI-CHECK: flat_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG2]] -define amdgpu_ps float @test_wwm6_then() { +define amdgpu_ps float @test_wwm6_then() #1 { main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -309,7 +309,7 @@ ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword ;CHECK: s_mov_b64 exec, [[ORIG2]] -define amdgpu_ps float @test_wwm6_loop() { +define amdgpu_ps float @test_wwm6_loop() #1 { main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -339,7 +339,7 @@ ;CHECK: s_not_b64 exec, exec ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ;CHECK: v_add_{{[iu]}}32_e32 -define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) { +define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) #1 { main_body: %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %src.0 = bitcast float %src to i32 @@ -357,7 +357,7 @@ ;CHECK: s_wqm_b64 exec, exec ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword -define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) #1 { main_body: %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %src1.0 = bitcast float %src1 to i32 @@ -389,7 +389,7 @@ ;CHECK: %IF ;CHECK: image_sample ;CHECK: image_sample -define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) #1 { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -432,7 +432,7 @@ ;CHECK: s_or_b64 exec, exec, ;CHECK: v_mov_b32_e32 v0 ;CHECK: ; return -define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) #1 { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %ELSE, label %IF @@ -468,7 +468,7 @@ ;CHECK: store ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmp -define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) #1 { main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 %data.1 = extractelement <2 x float> %data, i32 0 @@ -512,7 +512,7 @@ ;CHECK: image_sample ;CHECK-DAG: v_cmp ;CHECK-DAG: store -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) #1 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -550,7 +550,7 @@ ;CHECK: %END ;CHECK: image_sample ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) #1 { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END @@ -582,7 +582,7 @@ ;CHECK: buffer_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) #1 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %idx.0 = extractelement <2 x i32> %idx, i32 0 @@ -615,7 +615,7 @@ ; CHECK: buffer_store_dword ; CHECK-NOT: wqm ; CHECK: v_cmpx_ -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) #1 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -803,7 +803,7 @@ ;CHECK: s_or_saveexec_b64 {{.*}}, -1 ;CHECK: ds_swizzle ; -define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) #1 { main_body: %c.bc = bitcast i32 %c to float %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 diff --git a/llvm/test/DebugInfo/AMDGPU/cfi.ll b/llvm/test/DebugInfo/AMDGPU/cfi.ll new file mode 100644 --- /dev/null +++ b/llvm/test/DebugInfo/AMDGPU/cfi.ll @@ -0,0 +1,30 @@ +; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - %s | llvm-dwarfdump -debug-frame - | FileCheck %s + +; CHECK: .debug_frame contents: +; CHECK: 00000000 0000000c ffffffff CIE +; CHECK-NEXT: Version: 4 +; CHECK-NEXT: Augmentation: "" +; CHECK-NEXT: Address size: 8 +; CHECK-NEXT: Segment desc size: 0 +; CHECK-NEXT: Code alignment factor: 4 +; CHECK-NEXT: Data alignment factor: 4 +; CHECK-NEXT: Return address column: 16 +; CHECK-EMPTY: +; CHECK-NEXT: DW_CFA_nop: +; CHECK-EMPTY: +; CHECK-NEXT: 00000010 {{[0-9]+}} 00000000 FDE cie=00000000 pc=00000000...{{[0-9]+}} +; CHECK: .eh_frame contents: + +define void @func() #0 { + ret void +} + +attributes #0 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 2, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, emissionKind: FullDebug) +!3 = !DIFile(filename: "file", directory: "dir") diff --git a/llvm/test/MC/ELF/AMDGPU/cfi.s b/llvm/test/MC/ELF/AMDGPU/cfi.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/ELF/AMDGPU/cfi.s @@ -0,0 +1,57 @@ +// RUN: llvm-mc -filetype=asm -mcpu=gfx900 -triple amdgcn-amd-amdhsa %s -o - | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -filetype=obj -mcpu=gfx900 -triple amdgcn-amd-amdhsa %s -o - | llvm-readobj -S --sr --sd | FileCheck --check-prefix=READOBJ %s + +f: + .cfi_sections .debug_frame + .cfi_startproc + s_nop 0 + .cfi_endproc + +// ASM: f: +// ASM-NEXT: .cfi_sections .debug_frame +// FIXME Why emit an extra empty line? +// ASM-EMPTY: +// ASM-NEXT: .cfi_startproc +// ASM-NEXT: s_nop 0 +// FIXME Why emit an extra empty line? +// ASM-EMPTY: +// ASM-NEXT: .cfi_endproc + +// READOBJ: Section { +// READOBJ: Name: .debug_frame +// READOBJ-NEXT: Type: SHT_PROGBITS +// READOBJ-NEXT: Flags [ +// READOBJ-NEXT: ] +// READOBJ-NEXT: Address: 0x0 +// READOBJ-NEXT: Offset: 0x48 +// READOBJ-NEXT: Size: 40 +// READOBJ-NEXT: Link: 0 +// READOBJ-NEXT: Info: 0 +// READOBJ-NEXT: AddressAlignment: 8 +// READOBJ-NEXT: EntrySize: 0 +// READOBJ-NEXT: Relocations [ +// READOBJ-NEXT: ] +// READOBJ-NEXT: SectionData ( +// READOBJ-NEXT: 0000: 0C000000 FFFFFFFF 04000800 04041000 +// READOBJ-NEXT: 0010: 14000000 00000000 00000000 00000000 +// READOBJ-NEXT: 0020: 04000000 00000000 +// READOBJ-NEXT: ) +// READOBJ-NEXT: } + +// READOBJ: Section { +// READOBJ: Name: .rela.debug_frame +// READOBJ-NEXT: Type: SHT_RELA +// READOBJ-NEXT: Flags [ +// READOBJ-NEXT: ] +// READOBJ-NEXT: Address: 0x0 +// READOBJ-NEXT: Offset: +// READOBJ-NEXT: Size: 48 +// READOBJ-NEXT: Link: +// READOBJ-NEXT: Info: +// READOBJ-NEXT: AddressAlignment: 8 +// READOBJ-NEXT: EntrySize: 24 +// READOBJ-NEXT: Relocations [ +// READOBJ-NEXT: 0x14 R_AMDGPU_ABS32 .debug_frame 0x0 +// READOBJ-NEXT: 0x18 R_AMDGPU_ABS64 .text 0x0 +// READOBJ-NEXT: ] +// READOBJ: } diff --git a/llvm/test/MC/ELF/AMDGPU/lit.local.cfg b/llvm/test/MC/ELF/AMDGPU/lit.local.cfg new file mode 100644 --- /dev/null +++ b/llvm/test/MC/ELF/AMDGPU/lit.local.cfg @@ -0,0 +1,3 @@ +# We have to reset config.unsupported here because the parent directory is +# predicated on 'X86'. +config.unsupported = not 'AMDGPU' in config.root.targets