diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -99,6 +99,12 @@ "amdgpu-reserve-vgpr-for-sgpr-spill", cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true)); +static cl::opt UseDivergentRegisterIndexing( + "amdgpu-use-divergent-register-indexing", + cl::Hidden, + cl::desc("Use indirect register addressing for divergent indexes"), + cl::init(false)); + static bool hasFP32Denormals(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo(); return Info->getMode().allFP32Denormals(); @@ -9533,7 +9539,10 @@ // Sub-dword vectors of size 2 dword or less have better implementation. // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 // instructions. - if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) && + // Always do this if var-idx is divergent, otherwise it will become a loop. + if (!UseDivergentRegisterIndexing && + (VecSize <= 256 || N->getOperand(1)->isDivergent()) && + (VecSize > 64 || EltSize >= 32) && !isa(N->getOperand(1))) { SDLoc SL(N); SDValue Idx = N->getOperand(1); @@ -9603,8 +9612,10 @@ // Sub-dword vectors of size 2 dword or less have better implementation. // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 // instructions. - if (isa(Idx) || - VecSize > 256 || (VecSize <= 64 && EltSize < 32)) + // Always do this if var-idx is divergent, otherwise it will become a loop. + if (UseDivergentRegisterIndexing || isa(Idx) || + (VecSize > 256 && !Idx->isDivergent()) || + (VecSize <= 64 && EltSize < 32)) return SDValue(); SelectionDAG &DAG = DCI.DAG; diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -384,3 +384,31 @@ store i32 %zext, i32 addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}float32_extelt_vec: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 1.0, 2.0, [[CC1]] +; GCN-DAG: v_mov_b32_e32 [[LASTVAL:v[0-9]+]], 0x42000000 +; GCN-DAG: v_cmp_ne_u32_e32 [[LASTCC:[^,]+]], 31, v0 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v0, [[LASTVAL]], v{{[0-9]+}}, [[LASTCC]] +define float @float32_extelt_vec(i32 %sel) { +entry: + %ext = extractelement <32 x float> , i32 %sel + ret float %ext +} + +; GCN-LABEL: {{^}}double16_extelt_vec: +; GCN-NOT: buffer_ +; GCN-DAG: v_mov_b32_e32 [[V1HI:v[0-9]+]], 0x3ff19999 +; GCN-DAG: v_mov_b32_e32 [[V1LO:v[0-9]+]], 0x9999999a +; GCN-DAG: v_mov_b32_e32 [[V2HI:v[0-9]+]], 0x4000cccc +; GCN-DAG: v_mov_b32_e32 [[V2LO:v[0-9]+]], 0xcccccccd +; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1HI:v[0-9]+]], [[V1HI]], [[V2HI]], [[CC1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1LO:v[0-9]+]], [[V1LO]], [[V2LO]], [[CC1]] +define double @double16_extelt_vec(i32 %sel) { +entry: + %ext = extractelement <16 x double> , i32 %sel + ret double %ext +} diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll @@ -14,46 +14,10 @@ ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] -; GCN-DAG: v_add_u32_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]] +; GCN: v_cmp_eq_u32_e32 +; GCN-COUNT-32: v_cndmask_b32 -; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP0]] - -; FIXME: Redundant copy -; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] - -; GCN: s_mov_b64 [[MASK]], exec - -; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63 - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63 -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP1]] - -; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: - -; GCN: buffer_store_dword [[INS0]] +; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll @@ -19,44 +19,10 @@ ; GCN-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] -; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] -; GCN: s_and_saveexec_b64 vcc, vcc +; GCN: v_cmp_eq_u32_e32 +; GCN-COUNT-32: v_cndmask_b32 -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP0]] - -; FIXME: Redundant copy -; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] - -; GCN: s_mov_b64 [[MASK]], exec - -; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63 - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63 -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP1]] - -; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: - -; GCN: buffer_store_dword [[INS0]] +; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -131,22 +131,9 @@ ; GCN-LABEL: {{^}}extract_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. -; FIXME: The waitcnt for the argument load can go after the loop -; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec -; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: -; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}} -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0 -; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00 -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1 -; IDXMODE: s_set_gpr_idx_off - -; GCN: s_cbranch_execnz - +; GCN: v_cmp_eq_u32_e32 +; GCN-COUNT-14: v_cndmask_b32 +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16 ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { entry: @@ -301,40 +288,9 @@ ; GCN-LABEL: {{^}}insert_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}} - -; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: -; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00 -; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 33 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 33 -; IDXMODE: s_set_gpr_idx_off - -; GCN: s_cbranch_execnz [[LOOPBB]] -; GCN: s_mov_b64 exec, [[SAVEEXEC]] - -; GCN: buffer_store_dword +; GCN: v_cmp_eq_u32_e32 +; GCN-COUNT-16: v_cndmask_b32 +; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -346,38 +302,9 @@ ; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr: -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}} -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}} -; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}} - -; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec - -; The offset depends on the register that holds the first element of the vector. -; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]] - -; MOVREL: s_add_i32 m0, [[READLANE]], -16 -; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]] - -; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16 -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) -; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]] -; IDXMODE: s_set_gpr_idx_off - -; GCN: s_cbranch_execnz +; GCN: v_cmp_eq_u32_e32 +; GCN-COUNT-16: v_cndmask_b32 +; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -392,60 +319,13 @@ ; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block: -; FIXME: Why is vector copied in between? - ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9 -; GCN-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]] -; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]] - -; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec - -; GCN: s_waitcnt vmcnt(0) -; PREGFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] -; GFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]] - - -; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]] - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]] -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execnz [[LOOP0]] - -; FIXME: Redundant copy -; GCN: s_mov_b64 exec, [[MASK]] - -; GCN: v_mov_b32_e32 [[VEC_ELT0_2:v[0-9]+]], [[S_ELT0]] - -; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec - -; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]] - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(SRC0) -; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]] -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP1]] +; GCN: v_cmp_eq_u32 +; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16, +; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16, -; GCN: buffer_store_dword [[MOVREL0]] -; GCN: buffer_store_dword [[MOVREL1]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -363,3 +363,37 @@ store <128 x i1> %v, <128 x i1> addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}float32_inselt_vec: +; GCN-NOT: buffer_ +; GCN-COUNT-32: v_cmp_ne_u32 +; GCN-COUNT-32: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, +define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) { +entry: + %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel + ret <32 x float> %v +} + +; GCN-LABEL: {{^}}double8_inselt_vec: +; GCN-NOT: buffer_ +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +; GCN: v_cmp_eq_u32 +; GCN-COUNT-2: v_cndmask_b32 +define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { +entry: + %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel + ret <8 x double> %v +} diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r | FileCheck --check-prefix=RELS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r | FileCheck --check-prefix=RELS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10 %s ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0