Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -1136,6 +1136,41 @@
       return true;
     }
 
+    // %0:_(s8), %1 = G_UNMERGE_VALUES %a:_(<2 x s8>)
+    // %2:_(s8), %3 = G_UNMERGE_VALUES %b:_(<2 x s8>)
+    // %x:_(<4 x s8>) = G_BUILD_VECTOR %0:_(s8), %1, %2, %3
+    //
+    // %x:_(<4 x s8>) = G_CONCAT_VECTORS %a:_(<2 x s8>), %b
+    if (isMergeOrUnmergeCompatible(DstTy, UnmergeSrcTy)) {
+      if (!isInstLegal({TargetOpcode::G_CONCAT_VECTORS, {DstTy, UnmergeSrcTy}}))
+        return false;
+      SmallVector<Register, 8> ConcatSources;
+      unsigned CurrentIdx = 1;
+      while (CurrentIdx < MI.getNumOperands()) {
+        if (!(Unmerge = dyn_cast<GUnmerge>(
+                  getDefIgnoringCopies(MI.getReg(CurrentIdx), MRI))))
+          return false;
+
+        // Sources from all unmerges should have the same type.
+        if (MRI.getType(Unmerge->getSourceReg()) != UnmergeSrcTy)
+          return false;
+
+        // %0, %1, ..., %n = G_UNMERGE_VALUES %a
+        // %x = G_BUILD_VECTOR %a, %b, ..., %0, %1, ..., %n, ...
+        for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
+          if (MI.getReg(CurrentIdx + i) != Unmerge->getReg(i))
+            return false;
+        }
+
+        ConcatSources.push_back(Unmerge->getSourceReg());
+        CurrentIdx += Unmerge->getNumDefs();
+      }
+
+      Builder.buildMerge(Dst, ConcatSources);
+      DeadInsts.push_back(&MI);
+      return true;
+    }
+
     return false;
   }
 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.vNi16.ll.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/add.vNi16.ll.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.vNi16.ll.mir
@@ -205,27 +205,20 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
- ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
- ; GFX10-NEXT: s_clause 0x1
- ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[0:1], off
 ; GFX10-NEXT: global_load_ushort v10, v[0:1], off offset:8
 ; GFX10-NEXT: global_load_ushort v11, v[2:3], off offset:8
- ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
+ ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+ ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+ ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16
+ ; GFX10-NEXT: s_waitcnt vmcnt(3)
+ ; GFX10-NEXT: v_and_or_b32 v2, v10, v0, s0
 ; GFX10-NEXT: s_waitcnt vmcnt(2)
- ; GFX10-NEXT: v_pk_add_u16 v0, v8, v6
- ; GFX10-NEXT: v_pk_add_u16 v1, v9, v7
- ; GFX10-NEXT: s_waitcnt vmcnt(1)
- ; GFX10-NEXT: v_and_or_b32 v7, v10, v2, s0
+ ; GFX10-NEXT: v_and_or_b32 v3, v11, v0, s0
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
- ; GFX10-NEXT: v_and_or_b32 v8, v11, v2, s0
- ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
- ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
- ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
- ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
- ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
- ; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v6
- ; GFX10-NEXT: v_pk_add_u16 v2, v7, v8
+ ; GFX10-NEXT: v_pk_add_u16 v0, v6, v8
+ ; GFX10-NEXT: v_pk_add_u16 v1, v7, v9
+ ; GFX10-NEXT: v_pk_add_u16 v2, v2, v3
 ; GFX10-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
 ; GFX10-NEXT: global_store_short v[4:5], v2, off offset:8
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(p1) = COPY $vgpr2_vgpr3
 %2:_(p1) = COPY $vgpr4_vgpr5
@@ -399,41 +392,31 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
- ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
- ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
 ; GFX10-NEXT: global_load_ushort v14, v[0:1], off offset:20
+ ; GFX10-NEXT: s_clause 0x1
+ ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT: global_load_ushort v15, v[2:3], off offset:20
+ ; GFX10-NEXT: s_clause 0x1
+ ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX10-NEXT: global_load_dword v16, v[0:1], off offset:16
 ; GFX10-NEXT: global_load_dword v17, v[2:3], off offset:16
- ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
+ ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16
- ; GFX10-NEXT: s_waitcnt vmcnt(4)
- ; GFX10-NEXT: v_pk_add_u16 v0, v6, v10
- ; GFX10-NEXT: v_pk_add_u16 v2, v8, v12
- ; GFX10-NEXT: v_pk_add_u16 v1, v7, v11
- ; GFX10-NEXT: v_pk_add_u16 v6, v9, v13
+ ; GFX10-NEXT: s_waitcnt vmcnt(5)
+ ; GFX10-NEXT: v_and_or_b32 v14, v14, v0, s0
 ; GFX10-NEXT: s_waitcnt vmcnt(3)
- ; GFX10-NEXT: v_and_or_b32 v7, v14, v3, s0
- ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
- ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
- ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
- ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+ ; GFX10-NEXT: v_and_or_b32 v15, v15, v0, s0
 ; GFX10-NEXT: s_waitcnt vmcnt(2)
- ; GFX10-NEXT: v_and_or_b32 v12, v15, v3, s0
- ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
- ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
- ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
- ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+ ; GFX10-NEXT: v_pk_add_u16 v0, v10, v6
+ ; GFX10-NEXT: v_pk_add_u16 v1, v11, v7
+ ; GFX10-NEXT: v_pk_add_u16 v2, v12, v8
+ ; GFX10-NEXT: v_pk_add_u16 v3, v13, v9
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
- ; GFX10-NEXT: v_pk_add_u16 v13, v16, v17
- ; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v8
- ; GFX10-NEXT: v_and_or_b32 v2, v2, v3, v10
- ; GFX10-NEXT: v_and_or_b32 v1, v1, v3, v9
- ; GFX10-NEXT: v_and_or_b32 v3, v6, v3, v11
- ; GFX10-NEXT: v_pk_add_u16 v6, v7, v12
- ; GFX10-NEXT: global_store_dword v[4:5], v13, off offset:16
+ ; GFX10-NEXT: v_pk_add_u16 v6, v16, v17
+ ; GFX10-NEXT: v_pk_add_u16 v7, v14, v15
 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
- ; GFX10-NEXT: global_store_short v[4:5], v6, off offset:20
+ ; GFX10-NEXT: global_store_dword v[4:5], v6, off offset:16
+ ; GFX10-NEXT: global_store_short v[4:5], v7, off offset:20
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(p1) = COPY $vgpr2_vgpr3
 %2:_(p1) = COPY $vgpr4_vgpr5
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-add.vNi16-build-vector.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-add.vNi16-build-vector.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-add.vNi16-build-vector.mir
@@ -229,18 +229,11 @@
 ; GFX10: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV]], [[UV4]]
 ; GFX10: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV3]], [[UV7]]
 ; GFX10: [[ADD2:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]]
- ; GFX10: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[ADD]](<2 x s16>)
- ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
- ; GFX10: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[ADD1]](<2 x s16>)
- ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX10: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[ADD2]](<2 x s16>)
- ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[LSHR1]](s32)
- ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[ADD2]](<2 x s16>)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>)
 ; GFX10: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[COPY2]](p1) :: (store (<4 x s16>), align 4, addrspace 1)
 ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C]](s64)
- ; GFX10: G_STORE [[BITCAST2]](s32), [[PTR_ADD2]](p1) :: (store (s16), align 4, addrspace 1)
+ ; GFX10: G_STORE [[BITCAST]](s32), [[PTR_ADD2]](p1) :: (store (s16), align 4, addrspace 1)
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(p1) = COPY $vgpr2_vgpr3
 %2:_(p1) = COPY $vgpr4_vgpr5
@@ -460,27 +453,14 @@
 ; GFX10: [[ADD3:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV15]], [[UV31]]
 ; GFX10: [[ADD4:%[0-9]+]]:_(<2 x s16>) = G_ADD [[LOAD1]], [[LOAD4]]
 ; GFX10: [[ADD5:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]]
- ; GFX10: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[ADD]](<2 x s16>)
- ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C2]](s32)
- ; GFX10: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[ADD1]](<2 x s16>)
- ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C2]](s32)
- ; GFX10: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[ADD2]](<2 x s16>)
- ; GFX10: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C2]](s32)
- ; GFX10: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[ADD3]](<2 x s16>)
- ; GFX10: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C2]](s32)
- ; GFX10: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[ADD5]](<2 x s16>)
- ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR]](s32)
- ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR1]](s32)
- ; GFX10: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32)
- ; GFX10: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[LSHR3]](s32)
- ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>)
- ; GFX10: [[BITCAST7:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<8 x s16>)
- ; GFX10: G_STORE [[BITCAST7]](<4 x s32>), [[COPY2]](p1) :: (store (<4 x s32>), align 4, addrspace 1)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[ADD5]](<2 x s16>)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>), [[ADD2]](<2 x s16>), [[ADD3]](<2 x s16>)
+ ; GFX10: [[BITCAST3:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<8 x s16>)
+ ; GFX10: G_STORE [[BITCAST3]](<4 x s32>), [[COPY2]](p1) :: (store (<4 x s32>), align 4, addrspace 1)
 ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C]](s64)
 ; GFX10: G_STORE [[ADD4]](<2 x s16>), [[PTR_ADD4]](p1) :: (store (<2 x s16>), addrspace 1)
 ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD4]], [[C1]](s64)
- ; GFX10: G_STORE [[BITCAST6]](s32), [[PTR_ADD5]](p1) :: (store (s16), align 4, addrspace 1)
+ ; GFX10: G_STORE [[BITCAST2]](s32), [[PTR_ADD5]](p1) :: (store (s16), align 4, addrspace 1)
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(p1) = COPY $vgpr2_vgpr3
 %2:_(p1) = COPY $vgpr4_vgpr5
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
@@ -130,10 +130,8 @@
 ; GFX9-LABEL: name: concat
 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
- ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
- ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
- ; GFX9: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY1]](<2 x s32>)
+ ; GFX9: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s32>)
 %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
 %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
@@ -150,9 +148,8 @@
 ; GFX9-LABEL: name: concat_same_vector
 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV]](s32), [[UV1]](s32)
- ; GFX9: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY]](<2 x s32>)
+ ; GFX9: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[CONCAT_VECTORS]](<4 x s32>)
 %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
 %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
 %3:_(<4 x s32>) = G_BUILD_VECTOR %1, %2, %1, %2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
@@ -3018,30 +3018,24 @@
 ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
 ; CI: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
 ; CI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
- ; CI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
- ; CI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s32>)
- ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
- ; CI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
+ ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; CI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
 ; VI-LABEL: name: test_load_flat_v32s8_align32
 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
 ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
 ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
 ; VI: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
 ; VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
- ; VI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
- ; VI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s32>)
- ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
- ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
+ ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
 ; GFX9-LABEL: name: test_load_flat_v32s8_align32
 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
 ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
 ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
 ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
 ; GFX9: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
- ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
- ; GFX9: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s32>)
- ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
- ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
 %0:_(p0) = COPY $vgpr0_vgpr1
 %1:_(<32 x s8>) = G_LOAD %0 :: (load (<32 x s8>), align 32, addrspace 0)
 %2:_(<8 x s32>) = G_BITCAST %1
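
A minimal sketch of the transformation the new LegalizationArtifactCombiner.h block performs, modeled on the `concat` test above (virtual register names here are illustrative, not taken from the tests): when every operand of a G_BUILD_VECTOR is an in-order result of G_UNMERGE_VALUES instructions whose sources all share one type, and G_CONCAT_VECTORS is legal for the destination and source types, the build_vector is rewritten as a concat of the unmerge sources and the artifacts become dead.

Before the artifact combiner:
  %a:_(<2 x s32>) = COPY $vgpr0_vgpr1
  %b:_(<2 x s32>) = COPY $vgpr2_vgpr3
  %0:_(s32), %1:_(s32) = G_UNMERGE_VALUES %a(<2 x s32>)
  %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %b(<2 x s32>)
  %v:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)

After, the unmerges are dead and the build_vector is replaced with:
  %v:_(<4 x s32>) = G_CONCAT_VECTORS %a(<2 x s32>), %b(<2 x s32>)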