Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4369,12 +4369,11 @@
   MVT IntVT = MVT::getIntegerVT(VecSize);
 
   // Avoid stack access for dynamic indexing.
-  SDValue Val = InsVal;
-  if (InsVal.getValueType() == MVT::f16)
-    Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
-
   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
-  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
+
+  // Duplicate the value so that it doesn't matter which element is written
+  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+                               DAG.getSplatBuildVector(VecVT, SL, InsVal));
 
   assert(isPowerOf2_32(EltSize));
   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -814,8 +814,8 @@
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
-; GFX9: v_pk_mul_f16
 ; GFX9: v_mul_f16_e32
+; GFX9: v_pk_mul_f16
 ; GFX9-NOT: v_max
 ; GFX9-NOT: v_pk_max
 define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
Index: test/CodeGen/AMDGPU/insert_vector_dynelt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -112,7 +112,10 @@
 ; GCN-NOT: buffer_
 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
+; GCN: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
 define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
 entry:
   %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
@@ -168,9 +171,10 @@
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x10001
 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
 ; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
-; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], [[K]], v{{[0-9]+}}
 define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
 entry:
   %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
@@ -184,7 +188,10 @@
 ; GCN-NOT: buffer_
 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+; GCN: s_mov_b32 [[K:s[0-9]+]], 0x10001
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
 define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
 entry:
   %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
@@ -197,7 +204,11 @@
 ; GCN-NOT: buffer_
 ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101
+; GCN: s_and_b32 s3, s1, [[K]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
+; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
 define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
 entry:
   %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -242,7 +242,7 @@
 ; VI-NOT: _load
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
-; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
+; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 0x505, [[MASK]]
 ; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
 ; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
 ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
@@ -261,15 +261,14 @@
 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
 ; VI-NOT: _load
 
+; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]], [[SHIFTED_MASK]]
-; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
-; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
+; VI: v_lshrrev_b32_e32 [[V_HI2:v[0-9]+]], 16, [[BFI]]
 
-; VI-DAG: buffer_store_short [[BFI]]
-; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
+; VI: buffer_store_short [[BFI]]
 ; VI: buffer_store_byte [[V_HI2]]
 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
   %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
@@ -282,10 +281,11 @@
 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
 ; VI-NOT: _load
 
+; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
 ; VI: buffer_store_dword [[BFI]]
 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
@@ -303,9 +303,11 @@
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
 ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
+; VI: s_mov_b32 [[VAL:s[0-9]+]], 0x5050505
+; VI: s_and_b32 s[[INS_HI:[0-9]+]], s[[MASK_SHIFT_HI]], [[VAL]]
+; VI: s_and_b32 s[[INS_LO:[0-9]+]], s[[MASK_SHIFT_LO]], [[VAL]]
 ; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
-; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
-; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
+; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS_LO]]:[[INS_HI]]{{\]}}, [[AND]]
 ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
 ; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
 ; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -446,7 +446,7 @@
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x12341234
 
 ; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
@@ -611,25 +611,20 @@
 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
 ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
 
-; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
 ; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
-; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}}
-
-; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
-; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9+]]], v[[SHIFT_LO]]
-; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9+]]], v[[SHIFT_HI]]
-; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]]
-
-; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]]
-; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]]
-; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-
-
-; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
-; CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]],
-; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0,
+; CIVI-DAG: s_and_b32 [[MASKED_VAL:s[0-9]+]], [[VAL]], s[[MASK_LO]]
+; VI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[MASKED_VAL]], 16
+; CI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[VAL]], 16
+; CIVI: s_or_b32 [[DUP_VAL:s[0-9]+]], [[MASKED_VAL]], [[SHIFTED_VAL]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GFX9-DAG: s_pack_ll_b32_b16 [[DUP_VAL:s[0-9]+]], [[VAL]], [[VAL]]
+; GFX89: v_lshlrev_b64 v[{{[0-9:]+}}], [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
+; CI: v_lshl_b64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SCALED_IDX]]
+; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}}
 
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7
 
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
@@ -6,7 +6,7 @@
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7
 
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
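
For context, a minimal standalone C++ sketch of the splat-plus-bitfield-insert idea the new lowering relies on. It is not part of the patch, and the helper name and constants below are purely illustrative: duplicating the inserted value into every element makes the v_bfi_b32 result independent of which element the shifted 0xffff mask selects, so dynamic indexing needs no branching or scratch access.

// Illustrative only (hypothetical helper, not from the patch): insert a
// 16-bit value into element `idx` of a packed <2 x i16> held in 32 bits.
#include <cstdint>
#include <cstdio>

static uint32_t insert_elt_v2i16(uint32_t vec, uint16_t val, unsigned idx) {
  uint32_t splat = uint32_t(val) * 0x00010001u; // duplicate value into both halves
  uint32_t mask = 0xffffu << (16 * (idx & 1));  // shifted element mask (s_lshl / v_bfm)
  return (splat & mask) | (vec & ~mask);        // v_bfi_b32: (mask & splat) | (~mask & vec)
}

int main() {
  // Overwrite element 1 of <0xaaaa, 0xbbbb> with 0x3e7; prints 03e7aaaa.
  std::printf("%08x\n", insert_elt_v2i16(0xbbbbaaaau, 0x3e7, 1));
}

The wider vector cases in the tests follow the same pattern with a 64-bit mask: an s_lshl_b64 of 0xffff by the scaled index, then one v_bfi_b32 (or s_and/s_andn2/s_or) per 32-bit half against the splatted constant.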