Skip to content

Commit 511f7f5

Browse files
committed Jul 23, 2019
[AArch64][GlobalISel] Add support for s128 loads, stores, extracts, truncs.
We need to be able to load and store s128 for memcpy inlining, where we want to generate Q register mem ops. Making these legal also requires that we add some support in other instructions. Regbankselect should also know about s128 values: since no GPR register class can hold them, they need special handling to live on the FPR bank. Differential Revision: https://reviews.llvm.org/D65166 llvm-svn: 366857
1 parent 78b1e77 commit 511f7f5

12 files changed

+218
-279
lines changed
 

‎llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp

+73-11
Original file line numberDiff line numberDiff line change
@@ -1551,14 +1551,42 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
15511551
return true;
15521552
}
15531553
case TargetOpcode::G_EXTRACT: {
1554-
LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
1555-
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1554+
Register DstReg = I.getOperand(0).getReg();
1555+
Register SrcReg = I.getOperand(1).getReg();
1556+
LLT SrcTy = MRI.getType(SrcReg);
1557+
LLT DstTy = MRI.getType(DstReg);
15561558
(void)DstTy;
15571559
unsigned SrcSize = SrcTy.getSizeInBits();
1558-
// Larger extracts are vectors, same-size extracts should be something else
1559-
// by now (either split up or simplified to a COPY).
1560-
if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32)
1561-
return false;
1560+
1561+
if (SrcTy.getSizeInBits() > 64) {
1562+
// This should be an extract of an s128, which is like a vector extract.
1563+
if (SrcTy.getSizeInBits() != 128)
1564+
return false;
1565+
// Only support extracting 64 bits from an s128 at the moment.
1566+
if (DstTy.getSizeInBits() != 64)
1567+
return false;
1568+
1569+
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
1570+
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
1571+
// Check we have the right regbank always.
1572+
assert(SrcRB.getID() == AArch64::FPRRegBankID &&
1573+
DstRB.getID() == AArch64::FPRRegBankID &&
1574+
"Wrong extract regbank!");
1575+
1576+
// Emit the same code as a vector extract.
1577+
// Offset must be a multiple of 64.
1578+
unsigned Offset = I.getOperand(2).getImm();
1579+
if (Offset % 64 != 0)
1580+
return false;
1581+
unsigned LaneIdx = Offset / 64;
1582+
MachineIRBuilder MIB(I);
1583+
MachineInstr *Extract = emitExtractVectorElt(
1584+
DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
1585+
if (!Extract)
1586+
return false;
1587+
I.eraseFromParent();
1588+
return true;
1589+
}
15621590

15631591
I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
15641592
MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
@@ -1570,7 +1598,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
15701598
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
15711599
}
15721600

1573-
Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
1601+
DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
15741602
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
15751603
MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
15761604
.addReg(DstReg, 0, AArch64::sub_32);
@@ -1928,6 +1956,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
19281956
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
19291957
return true;
19301958
}
1959+
1960+
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
1961+
MachineIRBuilder MIB(I);
1962+
MachineInstr *Extract = emitExtractVectorElt(
1963+
DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
1964+
if (!Extract)
1965+
return false;
1966+
I.eraseFromParent();
1967+
return true;
1968+
}
19311969
}
19321970

19331971
return false;
@@ -2590,16 +2628,40 @@ bool AArch64InstructionSelector::selectMergeValues(
25902628
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
25912629
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
25922630
assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
2631+
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
25932632

2594-
// At the moment we only support merging two s32s into an s64.
25952633
if (I.getNumOperands() != 3)
25962634
return false;
2597-
if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
2598-
return false;
2599-
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
2635+
2636+
// Merging 2 s64s into an s128.
2637+
if (DstTy == LLT::scalar(128)) {
2638+
if (SrcTy.getSizeInBits() != 64)
2639+
return false;
2640+
MachineIRBuilder MIB(I);
2641+
Register DstReg = I.getOperand(0).getReg();
2642+
Register Src1Reg = I.getOperand(1).getReg();
2643+
Register Src2Reg = I.getOperand(2).getReg();
2644+
auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
2645+
MachineInstr *InsMI =
2646+
emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
2647+
if (!InsMI)
2648+
return false;
2649+
MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
2650+
Src2Reg, /* LaneIdx */ 1, RB, MIB);
2651+
if (!Ins2MI)
2652+
return false;
2653+
constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
2654+
constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
2655+
I.eraseFromParent();
2656+
return true;
2657+
}
2658+
26002659
if (RB.getID() != AArch64::GPRRegBankID)
26012660
return false;
26022661

2662+
if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
2663+
return false;
2664+
26032665
auto *DstRC = &AArch64::GPR64RegClass;
26042666
Register SubToRegDef = MRI.createVirtualRegister(DstRC);
26052667
MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),

‎llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -193,14 +193,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
193193
.legalIf([=](const LegalityQuery &Query) {
194194
const LLT &Ty0 = Query.Types[0];
195195
const LLT &Ty1 = Query.Types[1];
196-
if (Ty1 != s32 && Ty1 != s64)
196+
if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128)
197197
return false;
198198
if (Ty1 == p0)
199199
return true;
200200
return isPowerOf2_32(Ty0.getSizeInBits()) &&
201201
(Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
202202
})
203-
.clampScalar(1, s32, s64)
203+
.clampScalar(1, s32, s128)
204204
.widenScalarToNextPow2(1)
205205
.maxScalarIf(typeInSet(1, {s32}), 0, s16)
206206
.maxScalarIf(typeInSet(1, {s64}), 0, s32)
@@ -238,6 +238,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
238238
{s32, p0, 32, 8},
239239
{s64, p0, 64, 8},
240240
{p0, p0, 64, 8},
241+
{s128, p0, 128, 8},
241242
{v8s8, p0, 64, 8},
242243
{v16s8, p0, 128, 8},
243244
{v4s16, p0, 64, 8},
@@ -267,6 +268,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
267268
{s32, p0, 32, 8},
268269
{s64, p0, 64, 8},
269270
{p0, p0, 64, 8},
271+
{s128, p0, 128, 8},
270272
{v16s8, p0, 128, 8},
271273
{v4s16, p0, 64, 8},
272274
{v8s16, p0, 128, 8},

‎llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp

+15
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
635635
// Some of the floating-point instructions have mixed GPR and FPR operands:
636636
// fine-tune the computed mapping.
637637
switch (Opc) {
638+
case TargetOpcode::G_TRUNC: {
639+
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
640+
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
641+
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
642+
break;
643+
}
638644
case TargetOpcode::G_SITOFP:
639645
case TargetOpcode::G_UITOFP:
640646
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
@@ -793,6 +799,15 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
793799
// Index needs to be a GPR.
794800
OpRegBankIdx[3] = PMI_FirstGPR;
795801
break;
802+
case TargetOpcode::G_EXTRACT: {
803+
// For s128 sources we have to use fpr.
804+
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
805+
if (SrcTy.getSizeInBits() == 128) {
806+
OpRegBankIdx[0] = PMI_FirstFPR;
807+
OpRegBankIdx[1] = PMI_FirstFPR;
808+
}
809+
break;
810+
}
796811
case TargetOpcode::G_BUILD_VECTOR:
797812
// If the first source operand belongs to a FPR register bank, then make
798813
// sure that we preserve that.

‎llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ define void @nonpow2_load_narrowing() {
205205
ret void
206206
}
207207

208-
; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %4:_(s64) = G_EXTRACT %3:_(s96), 0 (in function: nonpow2_store_narrowing)
208+
; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: %5:fpr32(s32) = G_EXTRACT %21:fpr(s128), 64 (in function: nonpow2_store_narrowing)
209209
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_store_narrowing
210210
; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_store_narrowing:
211211
define void @nonpow2_store_narrowing(i96* %c) {

‎llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir

-82
Original file line numberDiff line numberDiff line change
@@ -1,88 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
22
# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s
33

4-
---
5-
name: test_extracts_1
6-
body: |
7-
bb.0:
8-
liveins: $w0
9-
10-
; Low part of extraction takes the entirety of the low register, so
11-
; value stored is forwarded directly from first load.
12-
13-
; CHECK-LABEL: name: test_extracts_1
14-
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
15-
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
16-
; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
17-
; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY2]](p0) :: (load 8, align 16)
18-
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
19-
; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY2]], [[C]](s64)
20-
; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8)
21-
; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[LOAD]](s64)
22-
; CHECK: G_STORE [[COPY3]](s64), [[COPY2]](p0) :: (store 8)
23-
; CHECK: RET_ReallyLR
24-
%0:_(s64) = COPY $x0
25-
%1:_(s32) = COPY $w1
26-
%2:_(p0) = COPY $x2
27-
%3:_(s128) = G_LOAD %2(p0) :: (load 16)
28-
%4:_(s64) = G_EXTRACT %3(s128), 0
29-
G_STORE %4(s64), %2(p0) :: (store 8)
30-
RET_ReallyLR
31-
...
32-
33-
---
34-
name: test_extracts_2
35-
body: |
36-
bb.0:
37-
liveins: $w0
38-
39-
; Low extraction takes the whole low register. High extraction is real.
40-
; CHECK-LABEL: name: test_extracts_2
41-
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
42-
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
43-
; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
44-
; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY2]](p0) :: (load 8, align 16)
45-
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
46-
; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY2]], [[C]](s64)
47-
; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8)
48-
; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[LOAD]](s64)
49-
; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD1]](s64), 0
50-
; CHECK: G_STORE [[COPY3]](s64), [[COPY2]](p0) :: (store 8)
51-
; CHECK: G_STORE [[EXTRACT]](s32), [[COPY2]](p0) :: (store 4)
52-
; CHECK: RET_ReallyLR
53-
%0:_(s64) = COPY $x0
54-
%1:_(s32) = COPY $w1
55-
%2:_(p0) = COPY $x2
56-
%3:_(s128) = G_LOAD %2(p0) :: (load 16)
57-
%4:_(s64) = G_EXTRACT %3(s128), 0
58-
%5:_(s32) = G_EXTRACT %3(s128), 64
59-
G_STORE %4(s64), %2(p0) :: (store 8)
60-
G_STORE %5(s32), %2(p0) :: (store 4)
61-
RET_ReallyLR
62-
...
63-
64-
---
65-
name: test_extracts_3
66-
body: |
67-
bb.0:
68-
liveins: $x0, $x1, $x2
69-
70-
71-
; CHECK-LABEL: name: test_extracts_3
72-
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
73-
; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
74-
; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY]](s64), 32
75-
; CHECK: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](s64), 0
76-
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[EXTRACT]](s32), [[EXTRACT1]](s32)
77-
; CHECK: $x0 = COPY [[MV]](s64)
78-
; CHECK: RET_ReallyLR
79-
%0:_(s64) = COPY $x0
80-
%1:_(s64) = COPY $x1
81-
%2:_(s128) = G_MERGE_VALUES %0, %1
82-
%3:_(s64) = G_EXTRACT %2, 32
83-
$x0 = COPY %3
84-
RET_ReallyLR
85-
...
864

875
---
886
name: test_extracts_4

0 commit comments

Comments
 (0)