diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17100,6 +17100,10 @@ // (vextract (scalar_to_vector val, 0) -> val if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // Only 0'th element of SCALAR_TO_VECTOR is defined. + if (DAG.isKnownNeverZero(Index)) + return DAG.getUNDEF(ScalarVT); + // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -388,17 +388,15 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v3, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v7i8_to_v7f32: @@ -437,23 +435,20 @@ ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_or_b32_e32 v4, v3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v8 ; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v4 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -160,6 +160,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 @@ -167,22 +169,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v6, v[0:1], off -; GFX9-NEXT: global_load_dword v7, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-NEXT: global_load_short_d16 v8, v[0:1], off offset:4 -; GFX9-NEXT: global_load_short_d16 v9, v[2:3], off offset:4 -; GFX9-NEXT: v_pk_max_i16 v6, v6, v7 +; GFX9-NEXT: global_load_short_d16 v5, v[2:3], off offset:4 +; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_pk_max_i16 v3, v6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v8, v9 -; GFX9-NEXT: global_store_dword v[4:5], v6, off -; GFX9-NEXT: global_store_short v[4:5], v0, off offset:4 +; GFX9-NEXT: v_pk_max_i16 v2, v7, v2 +; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid