diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -326,6 +326,9 @@
   LegalizeResult fewerElementsVectorReductions(MachineInstr &MI,
                                                unsigned TypeIdx, LLT NarrowTy);
 
+  LegalizeResult fewerElementsVectorShuffle(MachineInstr &MI, unsigned TypeIdx,
+                                            LLT NarrowTy);
+
   LegalizeResult narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4197,11 +4197,154 @@
     return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
   GISEL_VECREDUCE_CASES_NONSEQ
     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
+  case G_SHUFFLE_VECTOR:
+    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
   default:
     return UnableToLegalize;
   }
 }
 
+LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
+    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+  if (TypeIdx != 0)
+    return UnableToLegalize;
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src1Reg = MI.getOperand(1).getReg();
+  Register Src2Reg = MI.getOperand(2).getReg();
+  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT Src1Ty = MRI.getType(Src1Reg);
+  LLT Src2Ty = MRI.getType(Src2Reg);
+  // The shuffle should be canonicalized by now.
+  if (DstTy != Src1Ty)
+    return UnableToLegalize;
+  if (DstTy != Src2Ty)
+    return UnableToLegalize;
+
+  if (!isPowerOf2_32(DstTy.getNumElements()))
+    return UnableToLegalize;
+
+  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
+  // Further legalization attempts will be needed to do split further.
+  NarrowTy = DstTy.changeNumElements(DstTy.getNumElements() / 2);
+  unsigned NewElts = NarrowTy.getNumElements();
+
+  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
+  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
+  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
+  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
+                        SplitSrc2Regs[1]};
+
+  Register Hi, Lo;
+
+  // If Lo or Hi uses elements from at most two of the four input vectors, then
+  // express it as a vector shuffle of those two inputs. Otherwise extract the
+  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
+  SmallVector<int, 16> Ops;
+  for (unsigned High = 0; High < 2; ++High) {
+    Register &Output = High ? Hi : Lo;
+
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with useBuildVector set.
+    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
+    unsigned FirstMaskIdx = High * NewElts;
+    bool UseBuildVector = false;
+    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+      // The mask element. This indexes into the input.
+      int Idx = Mask[FirstMaskIdx + MaskOffset];
+
+      // The input vector this mask element indexes into.
+      unsigned Input = (unsigned)Idx / NewElts;
+
+      if (Input >= array_lengthof(Inputs)) {
+        // The mask element does not index into any input vector.
+        Ops.push_back(-1);
+        continue;
+      }
+
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NewElts;
+
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input) {
+          // This input vector is already an operand.
+          break;
+        } else if (InputUsed[OpNo] == -1U) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
+      }
+
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used! Give up on trying to create a
+        // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+        UseBuildVector = true;
+        break;
+      }
+
+      // Add the mask index for the new shuffle vector.
+      Ops.push_back(Idx + OpNo * NewElts);
+    }
+
+    if (UseBuildVector) {
+      LLT EltTy = NarrowTy.getElementType();
+      SmallVector<Register, 16> SVOps;
+
+      // Extract the input elements by hand.
+      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+        // The mask element. This indexes into the input.
+        int Idx = Mask[FirstMaskIdx + MaskOffset];
+
+        // The input vector this mask element indexes into.
+        unsigned Input = (unsigned)Idx / NewElts;
+
+        if (Input >= array_lengthof(Inputs)) {
+          // The mask element is "undef" or indexes off the end of the input.
+          SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
+          continue;
+        }
+
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NewElts;
+
+        // Extract the vector element by hand.
+        SVOps.push_back(MIRBuilder
+                            .buildExtractVectorElement(
+                                EltTy, Inputs[Input],
+                                MIRBuilder.buildConstant(LLT::scalar(32), Idx))
+                            .getReg(0));
+      }
+
+      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
+      Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
+    } else if (InputUsed[0] == -1U) {
+      // No input vectors were used! The result is undefined.
+      Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
+    } else {
+      Register Op0 = Inputs[InputUsed[0]];
+      // If only one input was used, use an undefined vector for the other.
+      Register Op1 = InputUsed[1] == -1U
+                         ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
+                         : Inputs[InputUsed[1]];
+      // At least one input vector was used. Create a new shuffle vector.
+      Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
+    }
+
+    Ops.clear();
+  }
+
+  MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
   unsigned Opc = MI.getOpcode();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -136,3 +136,79 @@
     RET_ReallyLR implicit $d0, implicit $d1
 
 ...
+---
+name:            oversize_shuffle_v4i64
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $q0, $q1, $q2, $q3, $x0
+
+    ; CHECK-LABEL: name: oversize_shuffle_v4i64
+    ; CHECK: liveins: $q0, $q1, $q2, $q3, $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3
+    ; CHECK: [[COPY4:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[COPY1]](<2 x s64>), [[COPY2]], shufflemask(1, 2)
+    ; CHECK: [[SHUF1:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[COPY3]](<2 x s64>), [[COPY]], shufflemask(1, 2)
+    ; CHECK: G_STORE [[SHUF]](<2 x s64>), [[COPY4]](p0) :: (store 16, align 32)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY4]], [[C]](s64)
+    ; CHECK: G_STORE [[SHUF1]](<2 x s64>), [[PTR_ADD]](p0) :: (store 16 into unknown-address + 16)
+    ; CHECK: RET_ReallyLR
+    %3:_(<2 x s64>) = COPY $q0
+    %4:_(<2 x s64>) = COPY $q1
+    %0:_(<4 x s64>) = G_CONCAT_VECTORS %3(<2 x s64>), %4(<2 x s64>)
+    %5:_(<2 x s64>) = COPY $q2
+    %6:_(<2 x s64>) = COPY $q3
+    %1:_(<4 x s64>) = G_CONCAT_VECTORS %5(<2 x s64>), %6(<2 x s64>)
+    %2:_(p0) = COPY $x0
+    %7:_(<4 x s64>) = G_SHUFFLE_VECTOR %0(<4 x s64>), %1, shufflemask(3, 4, 7, 0)
+    G_STORE %7(<4 x s64>), %2(p0) :: (store 32)
+    RET_ReallyLR
+
+...
+---
+name:            oversize_shuffle_v8i32_build_vector
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $q0, $q1, $q2, $q3, $x0
+
+    ; CHECK-LABEL: name: oversize_shuffle_v8i32_build_vector
+    ; CHECK: liveins: $q0, $q1, $q2, $q3, $x0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $q2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $q3
+    ; CHECK: [[COPY4:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C]](s64)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s32>), [[C1]](s64)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<4 x s32>), [[C2]](s64)
+    ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY3]](<4 x s32>), [[C3]](s64)
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[EVEC]](s32), [[EVEC1]](s32), [[EVEC2]](s32), [[EVEC3]](s32)
+    ; CHECK: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY1]](<4 x s32>), [[COPY]], shufflemask(2, 6, 5, 3)
+    ; CHECK: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](p0) :: (store 16, align 32)
+    ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY4]], [[C4]](s64)
+    ; CHECK: G_STORE [[SHUF]](<4 x s32>), [[PTR_ADD]](p0) :: (store 16 into unknown-address + 16)
+    ; CHECK: RET_ReallyLR
+    %3:_(<4 x s32>) = COPY $q0
+    %4:_(<4 x s32>) = COPY $q1
+    %0:_(<8 x s32>) = G_CONCAT_VECTORS %3(<4 x s32>), %4(<4 x s32>)
+    %5:_(<4 x s32>) = COPY $q2
+    %6:_(<4 x s32>) = COPY $q3
+    %1:_(<8 x s32>) = G_CONCAT_VECTORS %5(<4 x s32>), %6(<4 x s32>)
+    %2:_(p0) = COPY $x0
+    %7:_(<8 x s32>) = G_SHUFFLE_VECTOR %0(<8 x s32>), %1, shufflemask(0, 5, 10, 15, 6, 2, 1, 7)
+    G_STORE %7(<8 x s32>), %2(p0) :: (store 32)
+    RET_ReallyLR
+
+...