Skip to content

Commit 9deef20

Browse files
committedMar 2, 2018
[ARM] Fix codegen for VLD3/VLD4/VST3/VST4 with WB
Code generation of VLD3, VLD4, VST3 and VST4 with register writeback is broken due to 2 separate bugs: 1) VLD1d64TPseudoWB_register and VLD1d64QPseudoWB_register are missing rules to expand them to non pseudo MIR. These are selected for ARMISD::VLD3_UPD/VLD4_UPD with v1i64 vectors in SelectVLD. 2) Selection of the right VLD/VST instruction is broken for load and store of 3 and 4 v1i64 vectors. SelectVLD and SelectVST are called with MIR opcode for fixed writeback (ie increment is access size) and call getVLDSTRegisterUpdateOpcode() to select an opcode with register writeback if base register update is of a different size. Since getVLDSTRegisterUpdateOpcode() only knows about VLD1/VLD2/VST1/VST2 the call is currently conditional on the number of element in the vector. However, VLD1/VST1 is selected by SelectVLD/SelectVST's caller for load and stores of 3 or 4 v1i64 vectors. Therefore the opcode is not updated which later lead to a fixed writeback instruction being constructed with an extra operand for the register writeback. This patch addresses the two issues as follows: - it adds the necessary mapping from VLD1d64TPseudoWB_register and VLD1d64QPseudoWB_register to VLD1d64Twb_register and VLD1d64Qwb_register respectively. Like for the existing _fixed variants, the cost of these is bumped for unaligned access. - it changes the logic in SelectVLD and SelectVSD to call isVLDfixed and isVSTfixed respectively to decide whether the opcode should be updated. It also reworks the logic and comments for pushing the writeback offset operand and r0 operand to clarify the logic: writeback offset needs to be pushed if it's a register writeback, r0 needs to be pushed if not and the instruction is a VLD1/VLD2/VST1/VST2. Reviewers: rengolin, t.p.northover, samparker Reviewed By: samparker Patch by Thomas Preud'homme <thomas.preudhomme@arm.com> Differential Revision: https://reviews.llvm.org/D42970 llvm-svn: 326570
1 parent ddf6a33 commit 9deef20

File tree

7 files changed

+75
-16
lines changed

7 files changed

+75
-16
lines changed
 

‎llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -4214,6 +4214,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
42144214
case ARM::VLD3d32Pseudo:
42154215
case ARM::VLD1d64TPseudo:
42164216
case ARM::VLD1d64TPseudoWB_fixed:
4217+
case ARM::VLD1d64TPseudoWB_register:
42174218
case ARM::VLD3d8Pseudo_UPD:
42184219
case ARM::VLD3d16Pseudo_UPD:
42194220
case ARM::VLD3d32Pseudo_UPD:
@@ -4231,6 +4232,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
42314232
case ARM::VLD4d32Pseudo:
42324233
case ARM::VLD1d64QPseudo:
42334234
case ARM::VLD1d64QPseudoWB_fixed:
4235+
case ARM::VLD1d64QPseudoWB_register:
42344236
case ARM::VLD4d8Pseudo_UPD:
42354237
case ARM::VLD4d16Pseudo_UPD:
42364238
case ARM::VLD4d32Pseudo_UPD:

‎llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,10 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
156156

157157
{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false},
158158
{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false},
159+
{ ARM::VLD1d64QPseudoWB_register, ARM::VLD1d64Qwb_register, true, true, true, SingleSpc, 4, 1 ,false},
159160
{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false},
160161
{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false},
162+
{ ARM::VLD1d64TPseudoWB_register, ARM::VLD1d64Twb_register, true, true, true, SingleSpc, 3, 1 ,false},
161163

162164
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
163165
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
@@ -1503,6 +1505,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
15031505
case ARM::VLD3d32Pseudo:
15041506
case ARM::VLD1d64TPseudo:
15051507
case ARM::VLD1d64TPseudoWB_fixed:
1508+
case ARM::VLD1d64TPseudoWB_register:
15061509
case ARM::VLD3d8Pseudo_UPD:
15071510
case ARM::VLD3d16Pseudo_UPD:
15081511
case ARM::VLD3d32Pseudo_UPD:
@@ -1520,6 +1523,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
15201523
case ARM::VLD4d32Pseudo:
15211524
case ARM::VLD1d64QPseudo:
15221525
case ARM::VLD1d64QPseudoWB_fixed:
1526+
case ARM::VLD1d64QPseudoWB_register:
15231527
case ARM::VLD4d8Pseudo_UPD:
15241528
case ARM::VLD4d16Pseudo_UPD:
15251529
case ARM::VLD4d32Pseudo_UPD:

‎llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp

+19-16
Original file line numberDiff line numberDiff line change
@@ -1794,15 +1794,17 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
17941794
Ops.push_back(Align);
17951795
if (isUpdating) {
17961796
SDValue Inc = N->getOperand(AddrOpIdx + 1);
1797-
// FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
1798-
// case entirely when the rest are updated to that form, too.
17991797
bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
1800-
if ((NumVecs <= 2) && !IsImmUpdate)
1801-
Opc = getVLDSTRegisterUpdateOpcode(Opc);
1802-
// FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
1803-
// check for that explicitly too. Horribly hacky, but temporary.
1804-
if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate)
1805-
Ops.push_back(IsImmUpdate ? Reg0 : Inc);
1798+
if (!IsImmUpdate) {
1799+
// We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
1800+
// check for the opcode rather than the number of vector elements.
1801+
if (isVLDfixed(Opc))
1802+
Opc = getVLDSTRegisterUpdateOpcode(Opc);
1803+
Ops.push_back(Inc);
1804+
// VLD1/VLD2 fixed increment does not need Reg0 so only include it in
1805+
// the operands if not such an opcode.
1806+
} else if (!isVLDfixed(Opc))
1807+
Ops.push_back(Reg0);
18061808
}
18071809
Ops.push_back(Pred);
18081810
Ops.push_back(Reg0);
@@ -1948,16 +1950,17 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
19481950
Ops.push_back(Align);
19491951
if (isUpdating) {
19501952
SDValue Inc = N->getOperand(AddrOpIdx + 1);
1951-
// FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
1952-
// case entirely when the rest are updated to that form, too.
19531953
bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
1954-
if (NumVecs <= 2 && !IsImmUpdate)
1955-
Opc = getVLDSTRegisterUpdateOpcode(Opc);
1956-
// FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
1957-
// check for that explicitly too. Horribly hacky, but temporary.
1958-
if (!IsImmUpdate)
1954+
if (!IsImmUpdate) {
1955+
// We use a VST1 for v1i64 even if the pseudo says VST2/3/4, so
1956+
// check for the opcode rather than the number of vector elements.
1957+
if (isVSTfixed(Opc))
1958+
Opc = getVLDSTRegisterUpdateOpcode(Opc);
19591959
Ops.push_back(Inc);
1960-
else if (NumVecs > 2 && !isVSTfixed(Opc))
1960+
}
1961+
// VST1/VST2 fixed increment does not need Reg0 so only include it in
1962+
// the operands if not such an opcode.
1963+
else if (!isVSTfixed(Opc))
19611964
Ops.push_back(Reg0);
19621965
}
19631966
Ops.push_back(SrcReg);

‎llvm/test/CodeGen/ARM/vld3.ll

+13
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,19 @@ define <1 x i64> @vld3i64_update(i64** %ptr, i64* %A) nounwind {
9696
ret <1 x i64> %tmp4
9797
}
9898

99+
define <1 x i64> @vld3i64_reg_update(i64** %ptr, i64* %A) nounwind {
100+
;CHECK-LABEL: vld3i64_reg_update:
101+
;CHECK: vld1.64 {d16, d17, d18}, [{{r[0-9]+|lr}}:64], {{r[0-9]+|lr}}
102+
%tmp0 = bitcast i64* %A to i8*
103+
%tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8* %tmp0, i32 16)
104+
%tmp5 = getelementptr i64, i64* %A, i32 1
105+
store i64* %tmp5, i64** %ptr
106+
%tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
107+
%tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
108+
%tmp4 = add <1 x i64> %tmp2, %tmp3
109+
ret <1 x i64> %tmp4
110+
}
111+
99112
define <16 x i8> @vld3Qi8(i8* %A) nounwind {
100113
;CHECK-LABEL: vld3Qi8:
101114
;Check the alignment value. Max for this instruction is 64 bits:

‎llvm/test/CodeGen/ARM/vld4.ll

+13
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,19 @@ define <1 x i64> @vld4i64_update(i64** %ptr, i64* %A) nounwind {
9696
ret <1 x i64> %tmp4
9797
}
9898

99+
define <1 x i64> @vld4i64_reg_update(i64** %ptr, i64* %A) nounwind {
100+
;CHECK-LABEL: vld4i64_reg_update:
101+
;CHECK: vld1.64 {d16, d17, d18, d19}, [{{r[0-9]+|lr}}:256], {{r[0-9]+|lr}}
102+
%tmp0 = bitcast i64* %A to i8*
103+
%tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8* %tmp0, i32 64)
104+
%tmp5 = getelementptr i64, i64* %A, i32 1
105+
store i64* %tmp5, i64** %ptr
106+
%tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
107+
%tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
108+
%tmp4 = add <1 x i64> %tmp2, %tmp3
109+
ret <1 x i64> %tmp4
110+
}
111+
99112
define <16 x i8> @vld4Qi8(i8* %A) nounwind {
100113
;CHECK-LABEL: vld4Qi8:
101114
;Check the alignment value. Max for this instruction is 256 bits:

‎llvm/test/CodeGen/ARM/vst3.ll

+12
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,18 @@ define void @vst3i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
7373
ret void
7474
}
7575

76+
define void @vst3i64_reg_update(i64** %ptr, <1 x i64>* %B) nounwind {
77+
;CHECK-LABEL: vst3i64_reg_update
78+
;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}], r{{.*}}
79+
%A = load i64*, i64** %ptr
80+
%tmp0 = bitcast i64* %A to i8*
81+
%tmp1 = load <1 x i64>, <1 x i64>* %B
82+
call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
83+
%tmp2 = getelementptr i64, i64* %A, i32 1
84+
store i64* %tmp2, i64** %ptr
85+
ret void
86+
}
87+
7688
define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
7789
;CHECK-LABEL: vst3Qi8:
7890
;Check the alignment value. Max for this instruction is 64 bits:

‎llvm/test/CodeGen/ARM/vst4.ll

+12
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,18 @@ define void @vst4i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
7272
ret void
7373
}
7474

75+
define void @vst4i64_reg_update(i64** %ptr, <1 x i64>* %B) nounwind {
76+
;CHECK-LABEL: vst4i64_reg_update:
77+
;CHECK: vst1.64 {d16, d17, d18, d19}, [r{{[0-9]+}}], r{{[0-9]+}}
78+
%A = load i64*, i64** %ptr
79+
%tmp0 = bitcast i64* %A to i8*
80+
%tmp1 = load <1 x i64>, <1 x i64>* %B
81+
call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
82+
%tmp2 = getelementptr i64, i64* %A, i32 1
83+
store i64* %tmp2, i64** %ptr
84+
ret void
85+
}
86+
7587
define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
7688
;CHECK-LABEL: vst4Qi8:
7789
;Check the alignment value. Max for this instruction is 256 bits:

0 commit comments

Comments
 (0)
Please sign in to comment.