diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp --- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -75,6 +76,14 @@ bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI, + SmallVectorImpl<int> &Idxs) const; + bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; + + unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const; + MachineInstr *emitLoadFromConstantPool(Constant *CPVal, + MachineIRBuilder &MIRBuilder) const; + ComplexRendererFns selectArithImmed(MachineOperand &Root) const; ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, @@ -1696,6 +1705,8 @@ return selectMergeValues(I, MRI); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(I, MRI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return selectShuffleVector(I, MRI); } return false; @@ -1913,6 +1924,125 @@ return true; } +void AArch64InstructionSelector::collectShuffleMaskIndices( + MachineInstr &I, MachineRegisterInfo &MRI, + SmallVectorImpl<int> &Idxs) const { + MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg()); + assert( + MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR && + "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR"); + // Find the constant indices. 
+ for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) { + MachineInstr *ScalarDef = MRI.getVRegDef(MaskDef->getOperand(i).getReg()); + assert(ScalarDef && "Could not find vreg def of shufflevec index op"); + // Look through copies. + while (ScalarDef->getOpcode() == TargetOpcode::COPY) { + ScalarDef = MRI.getVRegDef(ScalarDef->getOperand(1).getReg()); + assert(ScalarDef && "Could not find def of copy operand"); + } + assert(ScalarDef->getOpcode() == TargetOpcode::G_CONSTANT); + Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue()); + } +} + +unsigned +AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal, + MachineFunction &MF) const { + Type *CPTy = CPVal->getType()->getPointerTo(); + unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy); + if (Align == 0) + Align = MF.getDataLayout().getTypeAllocSize(CPTy); + + MachineConstantPool *MCP = MF.getConstantPool(); + return MCP->getConstantPoolIndex(CPVal, Align); +} + +MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( + Constant *CPVal, MachineIRBuilder &MIRBuilder) const { + unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF()); + + auto Adrp = + MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) + .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); + auto Load = + MIRBuilder.buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp}) + .addConstantPoolIndex(CPIdx, 0, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); + return &*Load; +} + +bool AArch64InstructionSelector::selectShuffleVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + unsigned Src1Reg = I.getOperand(1).getReg(); + const LLT Src1Ty = MRI.getType(Src1Reg); + unsigned Src2Reg = I.getOperand(2).getReg(); + const LLT Src2Ty = MRI.getType(Src2Reg); + + MachineBasicBlock &MBB = 
*I.getParent(); + MachineFunction &MF = *MBB.getParent(); + LLVMContext &Ctx = MF.getFunction().getContext(); + + // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask + // operand, it comes in as a normal vector value which we have to analyze to + // find the mask indices. + SmallVector<int, 8> Mask; + collectShuffleMaskIndices(I, MRI, Mask); + assert(!Mask.empty() && "Expected to find mask indices"); + + // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if + // it's originated from a <1 x T> type. Those should have been lowered into + // G_BUILD_VECTOR earlier. + if (!Src1Ty.isVector() || !Src2Ty.isVector()) { + LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); + return false; + } + + unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; + + SmallVector<Constant *, 16> CstIdxs; + for (int Val : Mask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); + } + } + + if (DstTy.getSizeInBits() != 128) { + assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); + // This case can be done with TBL1. + return false; + } + + // Use a constant pool to load the index vector for TBL. + Constant *CPVal = ConstantVector::get(CstIdxs); + MachineIRBuilder MIRBuilder(I); + MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder); + if (!IndexLoad) { + LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); + return false; + } + + // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive + // Q registers for regalloc. 
+ auto RegSeq = MIRBuilder + .buildInstr(TargetOpcode::REG_SEQUENCE, + {&AArch64::QQRegClass}, {Src1Reg}) + .addImm(AArch64::qsub0) + .addUse(Src2Reg) + .addImm(AArch64::qsub1); + + auto TBL2 = + MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()}, + {RegSeq, IndexLoad->getOperand(0).getReg()}); + constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI); + constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectBuildVector( MachineInstr &I, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -461,6 +461,29 @@ {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) .scalarize(1); + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) + .legalIf([=](const LegalityQuery &Query) { + const LLT &DstTy = Query.Types[0]; + const LLT &SrcTy = Query.Types[1]; + // For now just support the TBL2 variant which needs the source vectors + // to be the same size as the dest. 
+ if (DstTy != SrcTy) + return false; + ArrayRef<LLT> SupportedDstTys = {v2s32, v4s32, v2s64}; + for (auto &Ty : SupportedDstTys) { + if (DstTy == Ty) + return true; + } + return false; + }) + // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we + // just want those lowered into G_BUILD_VECTOR + .lowerIf([=](const LegalityQuery &Query) { + return !Query.Types[1].isVector(); + }) + .clampNumElements(0, v4s32, v4s32) + .clampNumElements(0, v2s64, v2s64); + computeTables(); verify(*ST.getInstrInfo()); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s +--- +name: shuffle_v4i32 +alignment: 2 +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v4i32 + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: $q0 = COPY [[SHUF]](<4 x s32>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s32>) = COPY $q1 + %4:_(s32) = G_CONSTANT i32 0 + %3:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32), %4(s32), %4(s32) + %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>) + $q0 = COPY %2(<4 x s32>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: shuffle_v2i64 +alignment: 2 +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v2i64 + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32) + ; CHECK: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s64>), [[COPY1]], [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: $q0 = COPY [[SHUF]](<2 x s64>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<2 x s64>) = COPY $q0 + %1:_(<2 x s64>) = COPY $q1 + %4:_(s32) = G_CONSTANT i32 0 + %3:_(<2 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32) + %2:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, %3(<2 x s32>) + $q0 = COPY %2(<2 x s64>) + RET_ReallyLR implicit $q0 + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -313,7 +313,7 @@ # DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected # # DEBUG-NEXT: G_SHUFFLE_VECTOR (opcode {{[0-9]+}}): 3 type indices -# DEBUG: .. type index coverage check SKIPPED: no rules defined +# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected # # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices # DEBUG: .. 
type index coverage check SKIPPED: no rules defined diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir @@ -0,0 +1,151 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s +--- | + ; ModuleID = 'shufflevec-only-legal.ll' + source_filename = "shufflevec-only-legal.ll" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) { + %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0> + ret <4 x i32> %shuf + } + + define <4 x i32> @shuffle_tbl_v4i32(<4 x i32> %a, <4 x i32> %b) { + %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 0> + ret <4 x i32> %shuf + } + + define <2 x i64> @shuffle_v2i64(<2 x i64> %a, <2 x i64> %b) { + %shuf = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> zeroinitializer + ret <2 x i64> %shuf + } + +... 
+--- +name: shuffle_v4i32 +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v4i32 + ; CHECK: constants: + ; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3>' + ; CHECK: alignment: 8 + ; CHECK: isTargetSpecific: false + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 + ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0 + ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1 + ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]] + ; CHECK: $q0 = COPY [[TBLv16i8Two]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(<4 x s32>) = COPY $q0 + %1:fpr(<4 x s32>) = COPY $q1 + %4:gpr(s32) = G_CONSTANT i32 0 + %5:gpr(s32) = G_CONSTANT i32 1 + %6:gpr(s32) = G_CONSTANT i32 3 + %3:fpr(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %4(s32) + %2:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>) + $q0 = COPY %2(<4 x s32>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: shuffle_tbl_v4i32 +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } + - { id: 7, class: gpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_tbl_v4i32 + ; CHECK: constants: + ; CHECK: value: '<16 x i8> <i8 20, i8 21, i8 22, i8 23, i8 28, i8 29, i8 30, i8 31, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>' + ; CHECK: alignment: 8 + ; CHECK: isTargetSpecific: false + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 + ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0 + ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1 + ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]] + ; CHECK: $q0 = COPY [[TBLv16i8Two]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(<4 x s32>) = COPY $q0 + %1:fpr(<4 x s32>) = COPY $q1 + %4:gpr(s32) = G_CONSTANT i32 5 + %5:gpr(s32) = G_CONSTANT i32 7 + %6:gpr(s32) = G_CONSTANT i32 1 + %7:gpr(s32) = G_CONSTANT i32 0 + %3:fpr(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32) + %2:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>) + $q0 = COPY %2(<4 x s32>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: shuffle_v2i64 +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: gpr } +body: | + bb.1 (%ir-block.0): + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v2i64 + ; CHECK: constants: + ; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>' + ; CHECK: alignment: 8 + ; CHECK: isTargetSpecific: false + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 + ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0 + ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1 + ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]] + ; CHECK: $q0 = COPY [[TBLv16i8Two]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(<2 x s64>) = COPY $q0 + %1:fpr(<2 x s64>) = COPY $q1 + %4:gpr(s32) = G_CONSTANT i32 0 + %3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32) + %2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, %3(<2 x s32>) + $q0 = COPY %2(<2 x s64>) + RET_ReallyLR implicit $q0 + +...