Index: llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+
+; Regression test: this input used to crash because an unallocatable bundle
+; was produced to hint register allocation to form soft clauses. llc runs with
+; -verify-machineinstrs; the assertions below pin the gfx1010 output.
+define amdgpu_vs void @main(<8 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %arg, <4 x i32> inreg %arg1, i32 %arg2) #0 {
+; CHECK-LABEL: main:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_mov_b32 s20, s1
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_mov_b32 s21, s2
+; CHECK-NEXT: s_load_dwordx8 s[52:59], s[0:1], 0x4c0
+; CHECK-NEXT: s_mov_b32 s2, 0.5
+; CHECK-NEXT: s_mov_b32 s22, s3
+; CHECK-NEXT: s_mov_b32 s3, s2
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_dwordx8 s[44:51], s[0:1], 0x440
+; CHECK-NEXT: s_load_dwordx4 s[24:27], s[0:1], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v6, s3
+; CHECK-NEXT: v_mov_b32_e32 v5, s2
+; CHECK-NEXT: s_movk_i32 s2, 0x4b0
+; CHECK-NEXT: s_mov_b32 s3, s1
+; CHECK-NEXT: s_load_dwordx8 s[36:43], s[0:1], 0x480
+; CHECK-NEXT: s_load_dwordx4 s[28:31], s[2:3], 0x0
+; CHECK-NEXT: s_movk_i32 s2, 0x4f0
+; CHECK-NEXT: s_mov_b32 s23, s4
+; CHECK-NEXT: s_load_dwordx4 s[88:91], s[2:3], 0x0
+; CHECK-NEXT: s_movk_i32 s2, 0x530
+; CHECK-NEXT: s_movk_i32 s4, 0x5b0
+; CHECK-NEXT: s_mov_b32 s5, s1
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_dwordx8 s[68:75], s[0:1], 0x0
+; CHECK-NEXT: s_load_dwordx8 s[76:83], s[0:1], 0x5c0
+; CHECK-NEXT: s_mov_b32 s8, s1
+; CHECK-NEXT: s_mov_b32 s9, s1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_writelane_b32 v15, s52, 0
+; CHECK-NEXT: s_mov_b32 s10, s1
+; CHECK-NEXT: s_mov_b32 s11, s1
+; CHECK-NEXT: s_mov_b32 s12, s1
+; CHECK-NEXT: s_mov_b32 s13, s1
+; CHECK-NEXT: v_writelane_b32 v15, s53, 1
+; CHECK-NEXT: s_mov_b32 s14, s1
+; CHECK-NEXT: s_mov_b32 s15, s1
+; CHECK-NEXT: s_mov_b32 s16, s1
+; CHECK-NEXT: s_mov_b32 s17, s1
+; CHECK-NEXT: v_writelane_b32 v15, s54, 2
+; CHECK-NEXT: s_mov_b32 s18, s1
+; CHECK-NEXT: s_mov_b32 s19, s1
+; CHECK-NEXT: v_writelane_b32 v15, s55, 3
+; CHECK-NEXT: v_writelane_b32 v15, s56, 4
+; CHECK-NEXT: v_writelane_b32 v15, s57, 5
+; CHECK-NEXT: v_writelane_b32 v15, s58, 6
+; CHECK-NEXT: v_writelane_b32 v15, s59, 7
+; CHECK-NEXT: s_load_dwordx8 s[52:59], s[0:1], 0x500
+; CHECK-NEXT: s_load_dwordx4 s[92:95], s[2:3], 0x0
+; CHECK-NEXT: s_load_dwordx8 s[60:67], s[0:1], 0x540
+; CHECK-NEXT: s_movk_i32 s0, 0x5f0
+; CHECK-NEXT: s_movk_i32 s2, 0x570
+; CHECK-NEXT: s_load_dwordx4 s[96:99], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; CHECK-NEXT: s_load_dwordx4 s[84:87], s[2:3], 0x0
+; CHECK-NEXT: image_sample_lz v7, v[5:6], s[12:19], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: image_sample_lz v8, v[5:6], s[44:51], s[24:27] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: image_sample_lz v9, v[5:6], s[36:43], s[28:31] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: v_readlane_b32 s8, v15, 0
+; CHECK-NEXT: v_readlane_b32 s9, v15, 1
+; CHECK-NEXT: v_readlane_b32 s10, v15, 2
+; CHECK-NEXT: v_readlane_b32 s11, v15, 3
+; CHECK-NEXT: v_readlane_b32 s12, v15, 4
+; CHECK-NEXT: v_readlane_b32 s13, v15, 5
+; CHECK-NEXT: v_readlane_b32 s14, v15, 6
+; CHECK-NEXT: v_readlane_b32 s15, v15, 7
+; CHECK-NEXT: s_nop 4
+; CHECK-NEXT: image_sample_lz v10, v[5:6], s[8:15], s[88:91] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: image_sample_lz v11, v[5:6], s[52:59], s[92:95] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: image_sample_lz v12, v[5:6], s[68:75], s[96:99] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: image_sample_lz v14, v[5:6], s[76:83], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: image_sample_lz v13, v[5:6], s[60:67], s[84:87] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT: buffer_load_format_xyzw v[1:4], v0, s[20:23], 0 idxen
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: v_max_f32_e32 v0, v7, v8
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: v_max3_f32 v0, v0, v9, v10
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_add_f32_e32 v1, 0xbf6dd2f2, v12
+; CHECK-NEXT: v_add_f32_e32 v3, 0xbef8d4fe, v14
+; CHECK-NEXT: v_max3_f32 v0, v0, v11, v13
+; CHECK-NEXT: v_max3_f32 v0, v0, v1, v3
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x3d4ccccd, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; CHECK-NEXT: exp pos0 v0, v2, v0, v0 done vm
+; CHECK-NEXT: s_endpgm
+bb:
+  %i = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %arg1, i32 %arg2, i32 0, i32 0, i32 0) #4
+  %i3 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> undef, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0) #4
+  %i4 = extractelement <4 x float> %i3, i32 0
+  %i5 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %arg, i32 34
+  %i6 = load <8 x i32>, <8 x i32> addrspace(6)* %i5, align 32
+  %i7 = load <4 x i32>, <4 x i32> addrspace(6)* undef, align 16
+  %i8 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> %i6, <4 x i32> %i7, i1 false, i32 0, i32 0) #4
+  %i9 = extractelement <4 x float> %i8, i32 0
+  %i10 = call float @llvm.maxnum.f32(float %i4, float %i9) #4
+  %i11 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %arg, i32 36
+  %i12 = load <8 x i32>, <8 x i32> addrspace(6)* %i11, align 32
+  %i13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* null, i32 75
+  %i14 = load <4 x i32>, <4 x i32> addrspace(6)* %i13, align 16
+  %i15 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> %i12, <4 x i32> %i14, i1 false, i32 0, i32 0) #4
+  %i16 = extractelement <4 x float> %i15, i32 0
+  %i17 = call float @llvm.maxnum.f32(float %i10, float %i16) #4
+  %i18 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %arg, i32 38
+  %i19 = load <8 x i32>, <8 x i32> addrspace(6)* %i18, align 32
+  %i20 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* null, i32 79
+  %i21 = load <4 x i32>, <4 x i32> addrspace(6)* %i20, align 16
+  %i22 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> %i19, <4 x i32> %i21, i1 false, i32 0, i32 0) #4
+  %i23 = extractelement <4 x float> %i22, i32 0
+  %i24 = call float @llvm.maxnum.f32(float %i17, float %i23) #4
+  %i25 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %arg, i32 40
+  %i26 = load <8 x i32>, <8 x i32> addrspace(6)* %i25, align 32
+  %i27 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* null, i32 83
+  %i28 = load <4 x i32>, <4 x i32> addrspace(6)* %i27, align 16
+  %i29 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> %i26, <4 x i32> %i28, i1 false, i32 0, i32 0) #4
+  %i30 = extractelement <4 x float> %i29, i32 0
+  %i31 = call float @llvm.maxnum.f32(float %i24, float %i30) #4
+  %i32 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %arg, i32 42
+  %i33 = load <8 x i32>, <8 x i32> addrspace(6)* %i32, align 32
+  %i34 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* null, i32 87
+  %i35 = load <4 x i32>, <4 x i32> addrspace(6)* %i34, align 16
+  %i36 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> %i33, <4 x i32> %i35, i1 false, i32 0, i32 0) #4
+  %i37 = extractelement <4 x float> %i36, i32 0
+  %i38 = call float @llvm.maxnum.f32(float %i31, float %i37) #4
+  %i39 = load <8 x i32>, <8 x i32> addrspace(6)* undef, align 32
+  %i40 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* null, i32 91
+  %i41 = load <4 x i32>, <4 x i32> addrspace(6)* %i40, align 16
+  %i42 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> %i39, <4 x i32> %i41, i1 false, i32 0, i32 0) #4
+  %i43 = extractelement <4 x float> %i42, i32 0
+  %i44 = fadd float %i43, 0xBFEDBA5E40000000
+  %i45 = call float @llvm.maxnum.f32(float %i38, float %i44) #4
+  %i46 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %arg, i32 46
+  %i47 = load <8 x i32>, <8 x i32> addrspace(6)* %i46, align 32
+  %i48 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* null, i32 95
+  %i49 = load <4 x i32>, <4 x i32> addrspace(6)* %i48, align 16
+  %i50 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 5.000000e-01, float 5.000000e-01, <8 x i32> %i47, <4 x i32> %i49, i1 false, i32 0, i32 0) #4
+  %i51 = extractelement <4 x float> %i50, i32 0
+  %i52 = fadd float %i51, 0xBFDF1A9FC0000000
+  %i53 = call float @llvm.maxnum.f32(float %i45, float %i52) #4
+  %i54 = fcmp olt float %i53, 0x3FA99999A0000000
+  %i55 = select i1 %i54, float 1.000000e+00, float 0.000000e+00
+  %i56 = extractelement <4 x float> %i, i32 1
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %i55, float %i56, float undef, float undef, i1 true, i1 true) #0
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #1
+declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
+declare float @llvm.fabs.f32(float) #2
+declare float @llvm.maxnum.f32(float, float) #2
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly willreturn }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { inaccessiblememonly nounwind willreturn writeonly }
+attributes #4 = { nounwind readnone }