Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1537,17 +1537,36 @@
   MachineIRBuilder &B) const {
   // TODO: Should move some of this into LegalizerHelper.
 
-  // TODO: Promote dynamic indexing of s16 to s32
-  // TODO: Dynamic s64 indexing is only legal for SGPR.
-  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
-  if (!IdxVal) // Dynamic case will be selected to register indexing.
-    return true;
-
   Register Dst = MI.getOperand(0).getReg();
   Register Vec = MI.getOperand(1).getReg();
   Register Ins = MI.getOperand(2).getReg();
-
+  Register Idx = MI.getOperand(3).getReg();
   LLT VecTy = MRI.getType(Vec);
+
+  // TODO: Promote dynamic indexing of s16 to s32
+  // TODO: Dynamic s64 indexing is only legal for SGPR.
+  Optional<int64_t> IdxVal = getConstantVRegVal(Idx, MRI);
+  if (!IdxVal) {
+    // Dynamic case will be selected to register indexing.
+    if (VecTy.getNumElements() != 2)
+      return true;
+
+    // Compare and select is a lot cheaper than indexing in small cases.
+    LLT EltTy = VecTy.getElementType();
+    assert(MRI.getType(Ins) == EltTy);
+
+    B.setInstr(MI);
+    auto Zero = B.buildConstant(LLT::scalar(32), 0);
+    auto Eq0 = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), Idx, Zero);
+    auto Unmerge = B.buildUnmerge(EltTy, Vec);
+
+    auto Lo = B.buildSelect(EltTy, Eq0, Ins, Unmerge.getReg(0));
+    auto Hi = B.buildSelect(EltTy, Eq0, Unmerge.getReg(1), Ins);
+    B.buildBuildVector(Dst, {Lo.getReg(0), Hi.getReg(0)});
+    MI.eraseFromParent();
+    return true;
+  }
+
   LLT EltTy = VecTy.getElementType();
   assert(EltTy == MRI.getType(Ins));
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir
@@ -67,8 +67,13 @@
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr3_vgpr4
     ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
-    ; CHECK: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[TRUNC]](s32)
-    ; CHECK: $vgpr0_vgpr1 = COPY [[IVEC]](<2 x s32>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[TRUNC]](s32), [[C]]
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+    ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[COPY1]]
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(s32) = COPY $vgpr2
     %2:_(s64) = COPY $vgpr3_vgpr4
@@ -131,8 +136,13 @@
     ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
     ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C1]](s32)
     ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]](s32)
-    ; CHECK: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[ASHR]](s32)
-    ; CHECK: $vgpr0_vgpr1 = COPY [[IVEC]](<2 x s32>)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ASHR]](s32), [[C2]]
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+    ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[COPY1]]
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(s32) = COPY $vgpr2
     %2:_(s8) = G_CONSTANT i8 0