Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -1158,6 +1158,41 @@
       return true;
     }
 
+    // Combine when types are compatible (vectors and DstTy = N x UnmergeSrcTy).
+    // %0:_(s8), %1 = G_UNMERGE_VALUES %a:_(<2 x s8>)
+    // %2:_(s8), %3 = G_UNMERGE_VALUES %b:_(<2 x s8>)
+    // %x:_(<4 x s8>) = G_BUILD_VECTOR %0:_(s8), %1, %2, %3
+    //
+    // %x:_(<4 x s8>) = G_CONCAT_VECTORS %a:_(<2 x s8>), %b
+
+    if (UnmergeSrcTy.isVector() && DstTy.isVector() &&
+        getCoverTy(DstTy, UnmergeSrcTy) == DstTy) {
+      SmallVector<Register> ConcatSources;
+      unsigned EltIdx = 1;
+      while (EltIdx < MI.getNumOperands()) {
+        if (!(Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(MI.getReg(EltIdx)))))
+          return false;
+
+        // Sources from all unmerges should have the same type.
+        if (MRI.getType(Unmerge->getSourceReg()) != UnmergeSrcTy)
+          return false;
+
+        // %0, %1, ..., %n = G_UNMERGE_VALUES %a
+        // %x = G_BUILD_VECTOR %a, %b, ..., %0, %1, ..., %n, ...
+        for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
+          if (MI.getReg(EltIdx + i) != Unmerge->getReg(i))
+            return false;
+        }
+
+        ConcatSources.push_back(Unmerge->getSourceReg());
+        EltIdx += Unmerge->getNumDefs();
+      }
+
+      Builder.buildMerge(Dst, ConcatSources);
+      DeadInsts.push_back(&MI);
+      return true;
+    }
+
     return false;
   }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -576,32 +576,20 @@
 ; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:16
 ; GFX9-NEXT:    global_load_ushort v15, v[2:3], off offset:16
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_or_b32 v0, v14, v3, s4
+; GFX9-NEXT:    v_and_or_b32 v14, v14, v0, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_or_b32 v1, v15, v3, s4
+; GFX9-NEXT:    v_and_or_b32 v15, v15, v0, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_add_u16 v2, v6, v10
-; GFX9-NEXT:    v_pk_add_u16 v6, v7, v11
-; GFX9-NEXT:    v_pk_add_u16 v7, v8, v12
-; GFX9-NEXT:    v_pk_add_u16 v8, v9, v13
-; GFX9-NEXT:    v_pk_add_u16 v9, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_and_or_b32 v0, v2, v3, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v6, v3, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v7, v3, v10
-; GFX9-NEXT:    v_and_or_b32 v3, v8, v3, v11
+; GFX9-NEXT:    v_pk_add_u16 v0, v6, v10
+; GFX9-NEXT:    v_pk_add_u16 v1, v7, v11
+; GFX9-NEXT:    v_pk_add_u16 v2, v8, v12
+; GFX9-NEXT:    v_pk_add_u16 v3, v9, v13
+; GFX9-NEXT:    v_pk_add_u16 v6, v14, v15
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT:    global_store_short v[4:5], v9, off offset:16
+; GFX9-NEXT:    global_store_short v[4:5], v6, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <9 x i16>, <9 x i16> addrspace(1)* %ptra, align 4
@@ -768,51 +756,38 @@
 ; GFX9-LABEL: addv11i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off
 ; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:18
 ; GFX9-NEXT:    global_load_ushort v15, v[0:1], off offset:20
-; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_ushort v16, v[2:3], off offset:18
 ; GFX9-NEXT:    global_load_ushort v17, v[2:3], off offset:20
-; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v18, v[0:1], off offset:16
 ; GFX9-NEXT:    global_load_ushort v19, v[2:3], off offset:16
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v14
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_and_or_b32 v1, v15, v3, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v16
+; GFX9-NEXT:    v_pk_add_u16 v0, v6, v10
+; GFX9-NEXT:    v_pk_add_u16 v1, v7, v11
+; GFX9-NEXT:    v_pk_add_u16 v3, v9, v13
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v14
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_and_or_b32 v14, v17, v3, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v16
+; GFX9-NEXT:    v_pk_add_u16 v2, v8, v12
+; GFX9-NEXT:    v_and_or_b32 v8, v15, v6, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_pk_add_u16 v6, v10, v6
-; GFX9-NEXT:    v_pk_add_u16 v7, v11, v7
-; GFX9-NEXT:    v_pk_add_u16 v8, v12, v8
-; GFX9-NEXT:    v_pk_add_u16 v9, v13, v9
+; GFX9-NEXT:    v_and_or_b32 v10, v17, v6, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_or_b32 v0, v18, v3, v0
+; GFX9-NEXT:    v_and_or_b32 v7, v18, v6, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, v19, v3, v2
-; GFX9-NEXT:    v_pk_add_u16 v10, v1, v14
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT:    v_pk_add_u16 v14, v0, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
-; GFX9-NEXT:    v_and_or_b32 v0, v6, v3, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v7, v3, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v8, v3, v2
-; GFX9-NEXT:    v_and_or_b32 v3, v9, v3, v11
-; GFX9-NEXT:    global_store_short v[4:5], v14, off offset:16
-; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v14, off offset:18
+; GFX9-NEXT:    v_and_or_b32 v6, v19, v6, v9
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT:    global_store_short v[4:5], v10, off offset:20
+; GFX9-NEXT:    v_pk_add_u16 v8, v8, v10
+; GFX9-NEXT:    v_pk_add_u16 v0, v7, v6
+; GFX9-NEXT:    global_store_short v[4:5], v0, off offset:16
+; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v0, off offset:18
+; GFX9-NEXT:    global_store_short v[4:5], v8, off offset:20
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <11 x i16>, <11 x i16> addrspace(1)* %ptra, align 4
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
@@ -161,10 +161,8 @@
     ; GFX9-LABEL: name: concat
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
-    ; GFX9-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY1]](<2 x s32>)
+    ; GFX9-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
     %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
@@ -181,9 +179,8 @@
 
     ; GFX9-LABEL: name: concat_same_vector
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV]](s32), [[UV1]](s32)
-    ; GFX9-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[CONCAT_VECTORS]](<4 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
     %3:_(<4 x s32>) = G_BUILD_VECTOR %1, %2, %1, %2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
@@ -2852,10 +2852,8 @@
     ; CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
     ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; CI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
-    ; CI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s32>)
-    ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
-    ; CI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
+    ; CI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; CI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     ; VI-LABEL: name: test_load_flat_v32s8_align32
     ; VI: liveins: $vgpr0_vgpr1
     ; VI-NEXT: {{ $}}
@@ -2864,10 +2862,8 @@
     ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
    ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
-    ; VI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s32>)
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
-    ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
+    ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     ; GFX9-LABEL: name: test_load_flat_v32s8_align32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{ $}}
@@ -2876,10 +2872,8 @@
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
     ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s32>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
+    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<32 x s8>) = G_LOAD %0 :: (load (<32 x s8>), align 32, addrspace 0)
     %2:_(<8 x s32>) = G_BITCAST %1
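
For reference, the transformation the new combine performs, restated from the `concat` test above (virtual-register numbering here is illustrative, not taken from the generated checks): the input

  %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
  %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
  %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
  %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>)
  %6:_(<4 x s32>) = G_BUILD_VECTOR %2(s32), %3(s32), %4(s32), %5(s32)

is rewritten by the artifact combiner into

  %6:_(<4 x s32>) = G_CONCAT_VECTORS %0(<2 x s32>), %1(<2 x s32>)

leaving the G_UNMERGE_VALUES instructions for dead-code removal. The combine only fires when every unmerge result feeds the build vector consecutively and in order, and when the destination type is an exact multiple of the unmerge source type (the getCoverTy(DstTy, UnmergeSrcTy) == DstTy guard).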