diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5882,10 +5882,19 @@ assert(VecSize <= 64); + MVT IntVT = MVT::getIntegerVT(VecSize); + + // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly. + SDValue VecBC = peekThroughBitcasts(Vec); + if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) { + SDValue Src = VecBC.getOperand(0); + Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src); + Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT); + } + unsigned EltSize = EltVT.getSizeInBits(); assert(isPowerOf2_32(EltSize)); - MVT IntVT = MVT::getIntegerVT(VecSize); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); // Convert vector index to bit-index (* EltSize) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -174,12 +174,11 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -230,13 +229,13 @@ ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s1, s0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -299,14 +298,13 @@ ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s2, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -585,11 +583,10 @@ ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, s0, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_alignbit_b32 v2, v2, s4, 16 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -284,10 +284,9 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u8 v0, v0 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -779,10 +778,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -867,10 +865,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1036,10 +1033,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1118,10 +1114,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1502,10 +1497,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1604,9 +1598,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1706,9 +1699,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1845,10 +1837,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2085,9 +2076,8 @@ ; GFX803-NEXT: v_mov_b32_e32 v2, 44 ; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2229,9 +2219,8 @@ ; GFX803-NEXT: v_mov_b32_e32 v2, 44 ; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31]