Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1342,11 +1342,17 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, const TargetRegisterInfo *TRI) const { - // Handle only loads/stores with base register followed by immediate offset. - if (LdSt->getNumOperands() != 3) - return false; - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + // Handle only loads/stores with base register followed by immediate offset + // or pairs. + if (LdSt->getNumOperands() == 3) { + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + } else if (LdSt->getNumOperands() == 4) { + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isReg() || !LdSt->getOperand(3).isImm()) + return false; + } else { return false; + } // Offset is calculated as the immediate operand multiplied by the scaling factor. // Unscaled instructions have scaling factor set to 1. 
@@ -1392,10 +1398,30 @@ Width = 1; Scale = 1; break; + case AArch64::LDPQi: + case AArch64::STPQi: + // A pair's imm7 offset is scaled by the size of a single register of the pair; Width covers both registers. + Scale = 16; + Width = 32; + break; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case AArch64::STPDi: + Scale = 8; + Width = 16; + break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; break; + case AArch64::LDPWi: + case AArch64::STPWi: + case AArch64::LDPSi: + case AArch64::STPSi: + Scale = 4; + Width = 8; + break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: @@ -1422,8 +1448,13 @@ break; } - BaseReg = LdSt->getOperand(1).getReg(); - Offset = LdSt->getOperand(2).getImm() * Scale; + if (LdSt->getNumOperands() == 3) { + BaseReg = LdSt->getOperand(1).getReg(); + Offset = LdSt->getOperand(2).getImm() * Scale; + } else if (LdSt->getNumOperands() == 4) { + BaseReg = LdSt->getOperand(2).getReg(); + Offset = LdSt->getOperand(3).getImm() * Scale; + } return true; } Index: llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll =================================================================== --- llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll +++ llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll @@ -67,17 +67,19 @@ ; CHECK-LABEL: f2: ; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]] -; CHECK-EVEN: fmul [[y:d[0-9]*[13579]]] ; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]] +; CHECK-A57: fmadd [[x]] +; CHECK-EVEN: fmul [[y:d[0-9]*[13579]]] ; CHECK-ODD: fmul [[y:d[0-9]*[02468]]] -; CHECK: fmadd [[x]] ; CHECK: fmadd [[y]] +; CHECK-A53: fmadd [[x]] +; CHECK-A53: fmadd [[y]] ; CHECK: fmsub [[x]] -; CHECK: fmadd [[y]] +; CHECK-A57: fmadd [[y]] +; CHECK-A53-DAG: str [[y]] ; CHECK: fmadd [[x]] ; CHECK-A57: stp [[x]], [[y]] ; CHECK-A53-DAG: str [[x]] -; CHECK-A53-DAG: str [[y]] define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 { entry: @@ -162,17 +164,19 @@ ; CHECK-LABEL: f4: ; CHECK-EVEN: fmadd [[x:s[0-9]*[02468]]] -; CHECK-EVEN: fmul [[y:s[0-9]*[13579]]] ; CHECK-ODD: fmadd [[x:s[0-9]*[13579]]] +; CHECK-A57: fmadd [[x]] +; CHECK-EVEN: fmul [[y:s[0-9]*[13579]]] ; 
CHECK-ODD: fmul [[y:s[0-9]*[02468]]] -; CHECK: fmadd [[x]] ; CHECK: fmadd [[y]] +; CHECK-A53: fmadd [[x]] +; CHECK-A53: fmadd [[y]] ; CHECK: fmsub [[x]] -; CHECK: fmadd [[y]] +; CHECK-A57: fmadd [[y]] +; CHECK-A53-DAG: str [[y]] ; CHECK: fmadd [[x]] ; CHECK-A57: stp [[x]], [[y]] ; CHECK-A53-DAG: str [[x]] -; CHECK-A53-DAG: str [[y]] define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 { entry: Index: llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll +++ llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll @@ -8,8 +8,8 @@ define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp { entry: ; CHECK: test -; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32] ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]] %retval = alloca <16 x float>, align 16 Index: llvm/test/CodeGen/AArch64/fastcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/fastcc.ll +++ llvm/test/CodeGen/AArch64/fastcc.ll @@ -20,7 +20,7 @@ ; CHECK-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack8 -; CHECK-TAIL: sub sp, sp, #16 +; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) @@ -70,7 +70,7 @@ ; CHECK-TAIL: bl func_stack8 -; CHECK-TAIL: sub sp, sp, #16 +; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) @@ -113,7 +113,7 @@ ; CHECK-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack8 -; CHECK-TAIL: sub sp, sp, #16 +; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! 
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) Index: llvm/test/CodeGen/AArch64/ldst-pairing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/ldst-pairing.ll @@ -0,0 +1,102 @@ +; RUN: llc < %s -march=arm64 -mtriple=aarch64-none-linux-gnu | FileCheck %s + +; CHECK: stp +; CHECK: stp +define void @st1(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %a, <4 x float> * %a1 + store <4 x float> %b, <4 x float> * %b1 + store <4 x float> %c, <4 x float> * %c1 + store <4 x float> %d, <4 x float> * %d1 + + ret void +} + +; CHECK: stp +; CHECK: stp +define void @st2(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %base, i64 %index) { +entry: + %a0 = getelementptr inbounds float, float* %base, i64 %index + %b0 = getelementptr float, float* %a0, i64 4 + %c0 = getelementptr float, float* %a0, i64 8 + %d0 = getelementptr float, float* %a0, i64 12 + + %a1 = bitcast float* %a0 to <4 x float>* + %b1 = bitcast float* %b0 to <4 x float>* + %c1 = bitcast float* %c0 to <4 x float>* + %d1 = bitcast float* %d0 to <4 x float>* + + store <4 x float> %c, <4 x float> * %c1, align 4 + store <4 x float> %a, <4 x float> * %a1, align 4 + + ; This fadd forces the compiler to pair %c and %e after fadd, and leave the + ; stores %a and %b separated by a stp. The dependence analysis needs then to + ; prove that it is safe to move %b past the stp to be paired with %a. 
+ %e = fadd fast <4 x float> %d, %a + + store <4 x float> %e, <4 x float> * %d1, align 4 + store <4 x float> %b, <4 x float> * %b1, align 4 + + ret void +} + +; CHECK: stp +; CHECK: stp +define void @st3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %a, <4 x float> * %a1 + store <4 x float> %d, <4 x float> * %d1 + store <4 x float> %c, <4 x float> * %c1 + store <4 x float> %b, <4 x float> * %b1 + + ret void +} + +; CHECK: stp +; CHECK: stp +define void @st4(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %d, <4 x float> * %d1 + store <4 x float> %b, <4 x float> * %b1 + + %e = fadd fast <4 x float> %c, %a + + store <4 x float> %e, <4 x float> * %c1 + store <4 x float> %a, <4 x float> * %a1 + + ret void +} + +; FIXME: st5 should contain two stp. The current pairing algorithm is greedy and +; is pairing %c and %b, leaving the two other stores %a and %d not pairable. 
+ +; CHECK: stp +define void @st5(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %c, <4 x float> * %c1 + store <4 x float> %a, <4 x float> * %a1 + store <4 x float> %b, <4 x float> * %b1 + store <4 x float> %d, <4 x float> * %d1 + + ret void +}