Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -484,24 +484,32 @@ LLT DstTy = MRI.getType(DstRegs[0]); LLT LCMTy = getLCMType(SrcTy, PartTy); - const unsigned LCMSize = LCMTy.getSizeInBits(); const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); + unsigned CoveringSize = LCMTy.getSizeInBits(); Register UnmergeSrc = SrcReg; - if (LCMSize != SrcSize) { - // Widen to the common type. - Register Undef = B.buildUndef(SrcTy).getReg(0); - SmallVector<Register, 8> MergeParts(1, SrcReg); - for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize) - MergeParts.push_back(Undef); - - UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0); + + if (CoveringSize != SrcSize) { + // For scalars, it's common to be able to use a simple extension. + if (SrcTy.isScalar() && DstTy.isScalar()) { + CoveringSize = alignTo(SrcSize, DstSize); + LLT CoverTy = LLT::scalar(CoveringSize); + UnmergeSrc = B.buildInstr(ExtendOp, {CoverTy}, {SrcReg}).getReg(0); + } else { + // Widen to the common type. + // FIXME: This should respect the extend type + Register Undef = B.buildUndef(SrcTy).getReg(0); + SmallVector<Register, 8> MergeParts(1, SrcReg); + for (unsigned Size = SrcSize; Size != CoveringSize; Size += SrcSize) + MergeParts.push_back(Undef); + UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0); + } } // Unmerge to the original registers and pad with dead defs. 
SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end()); - for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize; + for (unsigned Size = DstSize * DstRegs.size(); Size != CoveringSize; Size += DstSize) { UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy)); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -1121,9 +1121,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(s48) = G_IMPLICIT_DEF - ; CHECK: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[LOAD]](s48), [[DEF1]](s48) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s96) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s48) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg @@ -1181,9 +1180,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(s48) = G_IMPLICIT_DEF - ; CHECK: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[LOAD]](s48), [[DEF1]](s48) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s96) + ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s48) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg @@ -1241,9 
+1239,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(s48) = G_IMPLICIT_DEF - ; CHECK: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[LOAD]](s48), [[DEF1]](s48) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s96) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s48) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg