Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -42,6 +42,9 @@
                    MachineBasicBlock::iterator I, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg, bool KillSrc) const;
 
+  void CopyPhysRegTuple(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator I, DebugLoc DL,
+                        unsigned DestReg, unsigned SrcReg) const;
 
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -134,7 +134,8 @@
       return;
     }
   } else {
-    llvm_unreachable("Unknown register class in copyPhysReg");
+    CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg);
+    return;
   }
 
   // E.g. ORR xDst, xzr, xSrc, lsl #0
@@ -144,6 +145,55 @@
     .addImm(0);
 }
 
+void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        DebugLoc DL, unsigned DestReg,
+                                        unsigned SrcReg) const {
+  unsigned SubRegs;
+  bool IsQRegs;
+  if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 2;
+    IsQRegs = false;
+  } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 3;
+    IsQRegs = false;
+  } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 4;
+    IsQRegs = false;
+  } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 2;
+    IsQRegs = true;
+  } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 3;
+    IsQRegs = true;
+  } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 4;
+    IsQRegs = true;
+  } else
+    llvm_unreachable("Unknown register class");
+
+  unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0;
+  int Spacing = 1;
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  // Copy register tuples backward when the first Dest reg overlaps
+  // with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + (SubRegs - 1);
+    Spacing = -1;
+  }
+
+  unsigned Opc = IsQRegs ? AArch64::ORRvvv_16B : AArch64::ORRvvv_8B;
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+    assert(Dst && Src && "Bad sub-register");
+    // A vector ORR with identical source operands is a register move.
+    BuildMI(MBB, I, DL, get(Opc), Dst)
+      .addReg(Src)
+      .addReg(Src);
+  }
+}
+
 /// Does the Opcode represent a conditional branch that we can remove and
 /// re-add at the end of a basic block?
 static bool isCondBranch(unsigned Opc) {
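An aside on the ordering logic above: the overlap check makes the tuple copy behave like memmove. When the destination tuple's first sub-register lies inside the source tuple (e.g. copying D0_D1_D2 into D1_D2_D3), a forward sub-register-by-sub-register copy would overwrite source registers before they are read, so the loop walks the tuple backward instead. The following is a minimal standalone sketch of that decision, with registers modeled as slots in an array; copyTuple and the slot model are illustrative only and not LLVM APIs:

    // Standalone illustration (not LLVM code) of the direction choice in
    // CopyPhysRegTuple. Array slots stand in for D/Q registers; an
    // overlapping tuple copy must proceed like memmove.
    #include <cassert>
    #include <cstdio>

    static void copyTuple(int *Regs, unsigned Dst, unsigned Src,
                          unsigned SubRegs) {
      unsigned BeginIdx = 0;
      int Spacing = 1;
      // Mirror of the TRI->regsOverlap() test in the patch: if the first
      // destination slot falls inside the source range, copy backward.
      if (Dst >= Src && Dst < Src + SubRegs) {
        BeginIdx = SubRegs - 1;
        Spacing = -1;
      }
      for (unsigned i = 0; i != SubRegs; ++i)
        // Each assignment stands in for one "orr vDst, vSrc, vSrc" move.
        Regs[Dst + BeginIdx + i * Spacing] = Regs[Src + BeginIdx + i * Spacing];
    }

    int main() {
      int Regs[8] = {10, 11, 12, 13, 0, 0, 0, 0};
      copyTuple(Regs, 1, 0, 3); // like D1_D2_D3 <- D0_D1_D2: overlapping
      assert(Regs[1] == 10 && Regs[2] == 11 && Regs[3] == 12);
      std::printf("%d %d %d\n", Regs[1], Regs[2], Regs[3]);
      return 0;
    }

A forward copy in the overlapping case would first write slot 1 and then read that clobbered value when filling slot 2; that is exactly the hazard the Spacing = -1 path avoids.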
Index: test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define void @test_copyTuples(i32* %a, i32* %b) {
+; CHECK-LABEL: test_copyTuples:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> , i32 0, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3
+  %1 = bitcast i32* %b to i8*
+  tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %vld3_lane.fca.0.extract, <2 x i32> %vld3_lane.fca.1.extract, <2 x i32> %vld3_lane.fca.2.extract, <2 x i32> %vld3_lane.fca.3.extract, i32 4)
+  %vld3_lane98 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> , i32 1, i32 4)
+  %vld3_lane98.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane98, 0
+  %vld3_lane98.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane98, 1
+  %vld3_lane98.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane98, 2
+  %vld3_lane98.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane98, 3
+  tail call void @llvm.arm.neon.vst4.v2i32(i8* %0, <2 x i32> %vld3_lane98.fca.0.extract, <2 x i32> %vld3_lane98.fca.1.extract, <2 x i32> %vld3_lane98.fca.2.extract, <2 x i32> %vld3_lane98.fca.3.extract, i32 4)
+  ret void
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+
+declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>)
\ No newline at end of file
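A note on what the test exercises (an informal reading, not part of the patch): a lane-form NEON load both reads and writes its four-vector tuple operand, so a tuple that must stay live across the first vld4lane while the second vld4lane still needs it has to be moved into a different set of consecutive Q registers. That move is the copy that now reaches CopyPhysRegTuple, and it lowers to the four "orr v.16b" moves the CHECK lines expect ahead of the ld4. The checked output can be reproduced straight from the RUN line, assuming an llc built with the AArch64 target:

    llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon neon-copyPhysReg-tuple.ll -o -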