Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// class AMDGPUReadPreloadRegisterIntrinsic - : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; class AMDGPUReadPreloadRegisterIntrinsicNamed - : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, GCCBuiltin; + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, GCCBuiltin; let TargetPrefix = "r600" in { @@ -47,7 +47,8 @@ // AS 7 is PARAM_I_ADDRESS, used for kernel arguments def int_r600_implicitarg_ptr : GCCBuiltin<"__builtin_r600_implicitarg_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_r600_rat_store_typed : // 1st parameter: Data @@ -57,15 +58,15 @@ GCCBuiltin<"__builtin_r600_rat_store_typed">; def int_r600_recipsqrt_ieee : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_r600_recipsqrt_clamped : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_r600_cube : Intrinsic< - [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem] + [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] >; } // End TargetPrefix = "r600" @@ -82,31 +83,36 @@ def int_amdgcn_dispatch_ptr : GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_queue_ptr : GCCBuiltin<"__builtin_amdgcn_queue_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_kernarg_segment_ptr : GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_implicitarg_ptr : GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_groupstaticsize : GCCBuiltin<"__builtin_amdgcn_groupstaticsize">, - Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_dispatch_id : GCCBuiltin<"__builtin_amdgcn_dispatch_id">, - Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>; + Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_implicit_buffer_ptr : GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; //===----------------------------------------------------------------------===// // Instruction Intrinsics @@ -135,115 +141,129 @@ // second. (0 = first, 1 = second). [llvm_anyfloat_ty, llvm_i1_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], - [IntrNoMem] + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], - [IntrNoMem] + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem] + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_trig_preop : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_sin : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cos : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_log_clamp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">, - Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem] + Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rcp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rcp_legacy : GCCBuiltin<"__builtin_amdgcn_rcp_legacy">, - Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem] + Intrinsic<[llvm_float_ty], [llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rsq : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rsq_legacy : GCCBuiltin<"__builtin_amdgcn_rsq_legacy">, Intrinsic< - [llvm_float_ty], [llvm_float_ty], [IntrNoMem] + [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rsq_clamp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_ldexp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_frexp_mant : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_frexp_exp : Intrinsic< - [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem] + [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable] >; // v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 // and always uses rtz, so is not suitable for implementing the OpenCL // fract function. It should be ok on VI. def int_amdgcn_fract : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cvt_pkrtz : Intrinsic< - [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_class : Intrinsic< - [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem] + [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_fmed3 : GCCBuiltin<"__builtin_amdgcn_fmed3">, Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem] + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubeid : GCCBuiltin<"__builtin_amdgcn_cubeid">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubema : GCCBuiltin<"__builtin_amdgcn_cubema">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubesc : GCCBuiltin<"__builtin_amdgcn_cubesc">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubetc : GCCBuiltin<"__builtin_amdgcn_cubetc">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; // v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz // should be used. def int_amdgcn_sffbh : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] +>; // Fields should mirror atomicrmw @@ -527,7 +547,9 @@ def int_amdgcn_s_getreg : GCCBuiltin<"__builtin_amdgcn_s_getreg">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrReadMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + [IntrReadMem, IntrSpeculatable] +>; // __builtin_amdgcn_interp_mov , , , // param values: 0 = P10, 1 = P20, 2 = P0 @@ -535,23 +557,24 @@ GCCBuiltin<"__builtin_amdgcn_interp_mov">, Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; // __builtin_amdgcn_interp_p1 , , , +// This intrinsic reads from lds, but the memory values are constant, +// so it behaves like IntrNoMem. def int_amdgcn_interp_p1 : GCCBuiltin<"__builtin_amdgcn_interp_p1">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; // This intrinsic reads from lds, but the memory - // values are constant, so it behaves like IntrNoMem. + [IntrNoMem, IntrSpeculatable]>; // __builtin_amdgcn_interp_p2 , , , , def int_amdgcn_interp_p2 : GCCBuiltin<"__builtin_amdgcn_interp_p2">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; // See int_amdgcn_v_interp_p1 for why this is - // IntrNoMem. + [IntrNoMem, IntrSpeculatable]>; + // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. // Pixel shaders only: whether the current pixel is live (i.e. not a helper // invocation for derivative computation). @@ -574,48 +597,68 @@ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>; def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem] + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem] + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_lerp : GCCBuiltin<"__builtin_amdgcn_lerp">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_sad_u8 : GCCBuiltin<"__builtin_amdgcn_sad_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_msad_u8 : GCCBuiltin<"__builtin_amdgcn_msad_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_sad_hi_u8 : GCCBuiltin<"__builtin_amdgcn_sad_hi_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_sad_u16 : GCCBuiltin<"__builtin_amdgcn_sad_u16">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_qsad_pk_u16_u8 : GCCBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, - Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_mqsad_pk_u16_u8 : GCCBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, - Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_mqsad_u32_u8 : GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, - Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_cvt_pk_u8_f32 : GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, - Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_icmp : Intrinsic<[llvm_i64_ty], [llvm_anyint_ty, LLVMMatchType<0>, llvm_i32_ty], @@ -716,6 +759,7 @@ // Emit 2.5 ulp, no denormal division. Should only be inserted by // pass based on !fpmath metadata. def int_amdgcn_fdiv_fast : Intrinsic< - [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; } Index: lib/Target/AMDGPU/R600Intrinsics.td =================================================================== --- lib/Target/AMDGPU/R600Intrinsics.td +++ lib/Target/AMDGPU/R600Intrinsics.td @@ -61,7 +61,7 @@ def int_r600_ddy : TextureIntrinsicFloatInput; def int_r600_dot4 : Intrinsic<[llvm_float_ty], - [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem] + [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] >; } // End TargetPrefix = "r600", isTarget = 1 Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -221,10 +221,10 @@ ret void } -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind } -; HSA: attributes #0 = { nounwind readnone } +; HSA: attributes #0 = { nounwind readnone speculatable } ; HSA: attributes #1 = { nounwind } ; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" } ; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" } Index: test/CodeGen/AMDGPU/zext-lid.ll =================================================================== --- test/CodeGen/AMDGPU/zext-lid.ll +++ test/CodeGen/AMDGPU/zext-lid.ll @@ -4,19 +4,19 @@ ; CHECK-NOT: and_b32 ; OPT-LABEL: @zext_grp_size_128 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !0 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !0 define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 { bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 127 store i32 %tmp1, i32 addrspace(1)* %arg, align 4 - %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp3 = and i32 %tmp2, 127 %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 - %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() %tmp6 = and i32 %tmp5, 127 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 @@ -24,19 +24,19 @@ } ; OPT-LABEL: @zext_grp_size_32x4x1 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !3 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !4 define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 { bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 31 store i32 %tmp1, i32 addrspace(1)* %arg, align 4 - %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp3 = and i32 %tmp2, 3 %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 - %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() %tmp6 = and i32 %tmp5, 1 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 @@ -44,19 +44,19 @@ } ; OPT-LABEL: @zext_grp_size_512 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !5 define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 { bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 65535 store i32 %tmp1, i32 addrspace(1)* %arg, align 4 - %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp3 = and i32 %tmp2, 65535 %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 - %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() %tmp6 = and i32 %tmp5, 65535 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 @@ -71,7 +71,8 @@ attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" } attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" } -attributes #2 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nounwind readnone } !0 = !{i32 32, i32 4, i32 1}