Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16017,6 +16017,10 @@ // (vextract (scalar_to_vector val, 0) -> val if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // Only 0'th element of SCALAR_TO_VECTOR is defined. + if (DAG.isKnownNeverZero(Index)) + return DAG.getUNDEF(ScalarVT); + // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. Index: test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- test/CodeGen/AMDGPU/max.i16.ll +++ test/CodeGen/AMDGPU/max.i16.ll @@ -160,6 +160,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 @@ -167,20 +169,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v6, v[2:3], off -; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v8, v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_pk_max_i16 v0, v0, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_pk_max_i16 v7, v7, v6 -; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4 -; GFX9-NEXT: global_load_short_d16 v8, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v8, v6 -; GFX9-NEXT: global_store_dword v[4:5], v7, off -; GFX9-NEXT: global_store_short v[4:5], v0, off offset:4 +; GFX9-NEXT: v_pk_max_i16 v1, v7, v6 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4 ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid