Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16667,6 +16667,10 @@ // (vextract (scalar_to_vector val, 0) -> val if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // Only 0'th element of SCALAR_TO_VECTOR is defined. + if (DAG.isKnownNeverZero(Index)) + return DAG.getUNDEF(ScalarVT); + // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -387,17 +387,15 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v3, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v7i8_to_v7f32: @@ -415,45 +413,40 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[4:5] -; VI-NEXT: flat_load_ubyte v11, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 6, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: flat_load_ubyte v1, v[8:9] +; VI-NEXT: flat_load_ubyte v1, v[12:13] +; VI-NEXT: flat_load_ubyte v10, v[10:11] +; VI-NEXT: flat_load_ubyte v8, v[8:9] ; VI-NEXT: flat_load_ubyte v7, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_sdwa v1, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v10 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v4, v4, v7 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid Index: test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- test/CodeGen/AMDGPU/max.i16.ll +++ test/CodeGen/AMDGPU/max.i16.ll @@ -160,6 +160,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 @@ -167,20 +169,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v6, v[2:3], off -; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v8, v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_pk_max_i16 v0, v0, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_pk_max_i16 v7, v7, v6 -; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4 -; GFX9-NEXT: global_load_short_d16 v8, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v8, v6 -; GFX9-NEXT: global_store_dword v[4:5], v7, off -; GFX9-NEXT: global_store_short v[4:5], v0, off offset:4 +; GFX9-NEXT: v_pk_max_i16 v1, v7, v6 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4 ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid