Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -367,13 +367,14 @@ bool tryFoldUnmergeCast(MachineInstr &MI, MachineInstr &CastMI, SmallVectorImpl &DeadInsts, - SmallVectorImpl &UpdatedDefs) { + SmallVectorImpl &UpdatedDefs, + GISelChangeObserver &Observer) { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); const unsigned CastOpc = CastMI.getOpcode(); - if (!isArtifactCast(CastOpc)) + if (!isArtifactCast(CastOpc) && CastOpc != TargetOpcode::G_BITCAST) return false; const unsigned NumDefs = MI.getNumOperands() - 1; @@ -455,6 +456,30 @@ } } + if (CastOpc == TargetOpcode::G_BITCAST) { + MachineInstr *MergeI = MRI.getVRegDef(CastMI.getOperand(1).getReg()); + if (!isa(MergeI)) + return false; + // %2:_(s64) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + // %3:_(<2 x s32>) = G_BITCAST %2:_(s64) + // %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3:_(<2 x s32>) + // + // %4:_(s32) = COPY %0:_(s32) + // %5:_(s32) = COPY %1:_(s32) + if (DestTy != MRI.getType(MergeI->getOperand(1).getReg())) + return false; + + for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { + Register DstReg = MI.getOperand(Idx).getReg(); + Register SrcReg = MergeI->getOperand(Idx + 1).getReg(); + replaceRegOrBuildCopy(DstReg, SrcReg, MRI, Builder, UpdatedDefs, + Observer); + } + + markInstAndDefDead(MI, CastMI, DeadInsts); + return true; + } + // TODO: support combines with other casts as well return false; } @@ -888,7 +913,7 @@ ConvertOp, OpTy, DestTy)) { // We might have a chance to combine later by trying to combine // unmerge(cast) first - return tryFoldUnmergeCast(MI, *SrcDef, DeadInsts, UpdatedDefs); + return tryFoldUnmergeCast(MI, *SrcDef, DeadInsts, UpdatedDefs, Observer); } const unsigned NumMergeRegs = MergeI->getNumOperands() - 1; Index: llvm/test/CodeGen/AArch64/fold-global-offsets.ll =================================================================== --- llvm/test/CodeGen/AArch64/fold-global-offsets.ll +++ llvm/test/CodeGen/AArch64/fold-global-offsets.ll @@ -130,9 +130,6 @@ ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: adrp x8, x3+88 ; GISEL-NEXT: add x8, x8, :lo12:x3+88 -; GISEL-NEXT: mov v0.d[1], x8 -; GISEL-NEXT: mov d0, v0.d[1] -; GISEL-NEXT: fmov x8, d0 ; GISEL-NEXT: ldr w0, [x8, #20] ; GISEL-NEXT: ret Index: llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-extract.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-extract.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-extract.mir @@ -325,12 +325,9 @@ ; CHECK-LABEL: name: extract_s64_build_vector_v2s64_s64_s64_offset32 ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<2 x s64>) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BITCAST]](s128) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[UV]], [[C2]](s32) - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[C2]](s32) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[C]], [[C2]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[C1]], [[C2]](s32) ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR]], [[SHL]] ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[OR]](s64) %0:_(s64) = G_CONSTANT i64 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir @@ -1557,11 +1557,8 @@ ; CHECK-LABEL: name: test_unmerge_values_look_through_scalar_to_vector_bitcast ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[MV]](s64) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s64) = G_MERGE_VALUES %0:_(s32), %1:_(s32) @@ -1579,11 +1576,8 @@ ; CHECK-LABEL: name: test_unmerge_values_look_through_vector_to_scalar_bitcast ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s64) - ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(<2 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir @@ -70,15 +70,9 @@ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[UV]](s32) ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[UV1]](s32) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) - ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST1]](s32), [[LSHR]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC1]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s16>) = G_FPTRUNC %0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir @@ -169,16 +169,8 @@ ; GFX8-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 16 ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG2]], [[SEXT_INREG3]] ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[MUL1]], [[C]](s32) - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ASHR]], [[C1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ASHR1]], [[C1]] - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) - ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST1]], 16 - ; GFX8-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16 + ; GFX8-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 + ; GFX8-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR1]], 16 ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG4]](s32), [[SEXT_INREG5]](s32) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_smulh_v2s16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulo.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulo.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulo.mir @@ -216,22 +216,13 @@ ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG3]], [[SEXT_INREG4]] ; GFX8-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[MUL1]], 16 ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[MUL1]](s32), [[SEXT_INREG5]] - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[MUL1]], [[C]] - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) ; GFX8-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1) ; GFX8-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ANYEXT]], 1 ; GFX8-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ANYEXT1]], 1 ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG6]](s32), [[SEXT_INREG7]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) - ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX8-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST1]], 16 - ; GFX8-NEXT: [[SEXT_INREG9:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16 + ; GFX8-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[MUL]], 16 + ; GFX8-NEXT: [[SEXT_INREG9:%[0-9]+]]:_(s32) = G_SEXT_INREG [[MUL1]], 16 ; GFX8-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG8]](s32), [[SEXT_INREG9]](s32) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s32>) ; GFX8-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulh.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulh.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulh.mir @@ -374,14 +374,7 @@ ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C1]](s32) ; GFX8-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C]] ; GFX8-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C]] - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C1]](s32) - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL]] - ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) - ; GFX8-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX8-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C]] - ; GFX8-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C]] - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND6]](s32), [[AND7]](s32) + ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_umulh_v2s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulo.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulo.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulo.mir @@ -557,23 +557,15 @@ ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[AND4]] ; GFX8-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[MUL1]], [[C]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[MUL1]](s32), [[AND5]] - ; GFX8-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C]] - ; GFX8-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[MUL1]], [[C]] - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C1]](s32) - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]] - ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) ; GFX8-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1) - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX8-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] - ; GFX8-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND8]](s32), [[AND9]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) - ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX8-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C]] - ; GFX8-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C]] - ; GFX8-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND10]](s32), [[AND11]](s32) + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] + ; GFX8-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] + ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND6]](s32), [[AND7]](s32) + ; GFX8-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C]] + ; GFX8-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[MUL1]], [[C]] + ; GFX8-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND8]](s32), [[AND9]](s32) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s32>) ; GFX8-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_umulo_v2s16