diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -267,6 +267,109 @@
   TOCSaves[MI] = Keep;
 }
 
+// This function collects in PHIs all the PHI nodes of the tree rooted at the
+// RootPHI node. We perform a BFS traversal to get an ordered list of nodes.
+// The list initially only contains the root PHI. When we visit a PHI node, we
+// append the PHI nodes defining its operands to the list. We continue while
+// there are unvisited nodes in the list. The function returns false if the
+// optimization cannot be applied on this tree (see the checks in the loop).
+static bool collectUnprimedAccPHIs(MachineRegisterInfo *MRI,
+                                   MachineInstr *RootPHI,
+                                   SmallVectorImpl<MachineInstr *> &PHIs) {
+  PHIs.push_back(RootPHI);
+  unsigned VisitedIndex = 0;
+  while (VisitedIndex < PHIs.size()) {
+    MachineInstr *VisitedPHI = PHIs[VisitedIndex];
+    for (unsigned PHIOp = 1, NumOps = VisitedPHI->getNumOperands();
+         PHIOp != NumOps; PHIOp += 2) {
+      Register RegOp = VisitedPHI->getOperand(PHIOp).getReg();
+      if (!Register::isVirtualRegister(RegOp))
+        return false;
+      MachineInstr *Instr = MRI->getVRegDef(RegOp);
+      // While collecting the PHI nodes, we check if they can be converted (i.e.
+      // all the operands are either copies, implicit defs or PHI nodes).
+      unsigned Opcode = Instr->getOpcode();
+      if (Opcode == PPC::COPY) {
+        Register Reg = Instr->getOperand(1).getReg();
+        if (!Register::isVirtualRegister(Reg) ||
+            MRI->getRegClass(Reg) != &PPC::ACCRCRegClass)
+          return false;
+      } else if (Opcode != PPC::IMPLICIT_DEF && Opcode != PPC::PHI)
+        return false;
+      // If we detect a cycle in the PHI nodes (or reach a PHI node twice), we
+      // exit. It would be possible to change cycles as well, but that would
+      // add a lot of complexity for a case that is unlikely to occur with MMA
+      // code.
+      if (Opcode != PPC::PHI)
+        continue;
+      if (std::find(PHIs.begin(), PHIs.end(), Instr) != PHIs.end())
+        return false;
+      PHIs.push_back(Instr);
+    }
+    VisitedIndex++;
+  }
+  return true;
+}
+
+// This function changes the unprimed accumulator PHI nodes in the PHIs list to
+// primed accumulator PHI nodes. The list is traversed in reverse order to
+// change all the PHI operands of a PHI node before changing the node itself.
+// We keep a map to associate each original PHI node to its changed form. The
+// new root PHI defines Dst; the other new PHIs define fresh ACCRC registers.
+static void convertUnprimedAccPHIs(const PPCInstrInfo *TII,
+                                   MachineRegisterInfo *MRI,
+                                   SmallVectorImpl<MachineInstr *> &PHIs,
+                                   Register Dst) {
+  DenseMap<MachineInstr *, MachineInstr *> ChangedPHIMap;
+  for (auto It = PHIs.rbegin(), End = PHIs.rend(); It != End; ++It) {
+    MachineInstr *PHI = *It;
+    SmallVector<std::pair<MachineOperand, MachineOperand>, 4> PHIOps;
+    // We build the operands of the new PHI node from the operands of the
+    // current one. collectUnprimedAccPHIs already checked that each operand
+    // is defined by a copy from a primed accumulator, an implicit definition
+    // or another unprimed accumulator PHI node.
+    for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
+         PHIOp += 2) {
+      Register RegOp = PHI->getOperand(PHIOp).getReg();
+      MachineInstr *PHIInput = MRI->getVRegDef(RegOp);
+      unsigned Opcode = PHIInput->getOpcode();
+      assert((Opcode == PPC::COPY || Opcode == PPC::IMPLICIT_DEF ||
+              Opcode == PPC::PHI) &&
+             "Unexpected instruction");
+      if (Opcode == PPC::COPY) {
+        assert(MRI->getRegClass(PHIInput->getOperand(1).getReg()) ==
+                   &PPC::ACCRCRegClass &&
+               "Unexpected register class");
+        PHIOps.push_back({PHIInput->getOperand(1), PHI->getOperand(PHIOp + 1)});
+      } else if (Opcode == PPC::IMPLICIT_DEF) {
+        Register AccReg = MRI->createVirtualRegister(&PPC::ACCRCRegClass);
+        BuildMI(*PHIInput->getParent(), PHIInput, PHIInput->getDebugLoc(),
+                TII->get(PPC::IMPLICIT_DEF), AccReg);
+        PHIOps.push_back({MachineOperand::CreateReg(AccReg, false),
+                          PHI->getOperand(PHIOp + 1)});
+      } else if (Opcode == PPC::PHI) {
+        // We found a PHI operand. At this point we know this operand
+        // has already been changed so we get its associated changed form
+        // from the map.
+        assert(ChangedPHIMap.count(PHIInput) == 1 &&
+               "This PHI node should have already been changed.");
+        MachineInstr *PrimedAccPHI = ChangedPHIMap.lookup(PHIInput);
+        PHIOps.push_back({MachineOperand::CreateReg(
+                              PrimedAccPHI->getOperand(0).getReg(), false),
+                          PHI->getOperand(PHIOp + 1)});
+      }
+    }
+    Register AccReg = Dst;
+    if (PHI != PHIs[0])
+      AccReg = MRI->createVirtualRegister(&PPC::ACCRCRegClass);
+    MachineInstrBuilder NewPHI = BuildMI(
+        *PHI->getParent(), PHI, PHI->getDebugLoc(), TII->get(PPC::PHI), AccReg);
+    for (const auto &RegMBB : PHIOps)
+      NewPHI.add(RegMBB.first).add(RegMBB.second);
+    ChangedPHIMap[PHI] = NewPHI.getInstr();
+  }
+}
+
 // Perform peephole optimizations.
bool PPCMIPeephole::simplifyCode(void) { bool Simplified = false; @@ -321,6 +424,38 @@ default: break; + case PPC::COPY: { + Register Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(Src) || + !Register::isVirtualRegister(Dst)) + break; + if (MRI->getRegClass(Src) != &PPC::UACCRCRegClass || + MRI->getRegClass(Dst) != &PPC::ACCRCRegClass) + break; + + // We are copying an unprimed accumulator to a primed accumulator. + // If the input to the copy is a PHI that is fed only by (i) copies in + // the other directions (ii) implicitly defined unprimed accumulators or + // (iii) other PHI nodes satisfying (i) (ii) and (iii), we can change + // the PHI to a PHI on primed accumulators (as long as we also change + // its operands). To detect and change such copies, we first get a list + // of all the PHI nodes starting from the root PHI node in BFS order. + // We then visit all these PHI nodes to check if they can be changed to + // primed accumulator PHI nodes and if so, we change them. 
+ MachineInstr *RootPHI = MRI->getVRegDef(Src); + if (RootPHI->getOpcode() != PPC::PHI) + break; + + SmallVector PHIs; + if (!collectUnprimedAccPHIs(MRI, RootPHI, PHIs)) + break; + + convertUnprimedAccPHIs(TII, MRI, PHIs, Dst); + + ToErase = &MI; + break; + } case PPC::LI: case PPC::LI8: { // If we are materializing a zero, look for any use operands for which diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -262,8 +262,6 @@ ; CHECK-NEXT: xvi4ger8pp acc0, v2, v2 ; CHECK-NEXT: .LBB7_3: # %if.end ; CHECK-NEXT: xxmfacc acc0 -; CHECK-NEXT: xxmtacc acc0 -; CHECK-NEXT: xxmfacc acc0 ; CHECK-NEXT: stxv vs0, 48(r3) ; CHECK-NEXT: stxv vs1, 32(r3) ; CHECK-NEXT: stxv vs2, 16(r3) @@ -286,8 +284,6 @@ ; CHECK-BE-NEXT: xvi4ger8pp acc0, v2, v2 ; CHECK-BE-NEXT: .LBB7_3: # %if.end ; CHECK-BE-NEXT: xxmfacc acc0 -; CHECK-BE-NEXT: xxmtacc acc0 -; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -0,0 +1,346 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE + +declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.xxsetaccz() +declare <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>) +declare { <16 x i8>, <16 x i8>, 
<16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) +define void @testPHI1(<16 x i8>* %Dst, <16 x i8>* %Src, i32 signext %Len) { +; CHECK-LABEL: testPHI1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r5, 3 +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: blt cr0, .LBB0_3 +; CHECK-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NEXT: clrldi r6, r5, 32 +; CHECK-NEXT: addi r5, r4, 32 +; CHECK-NEXT: addi r6, r6, -2 +; CHECK-NEXT: lxv vs4, 0(r4) +; CHECK-NEXT: lxv vs5, 16(r4) +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lxv vs6, 0(r5) +; CHECK-NEXT: addi r5, r5, 16 +; CHECK-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-NEXT: # kill: def $uacc2 killed $acc0 +; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testPHI1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmpwi r5, 3 +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: blt cr0, .LBB0_3 +; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-NEXT: clrldi r6, r5, 32 +; CHECK-BE-NEXT: addi r5, r4, 32 +; CHECK-BE-NEXT: addi r6, r6, -2 +; CHECK-BE-NEXT: lxv vs4, 0(r4) +; CHECK-BE-NEXT: lxv vs5, 16(r4) +; CHECK-BE-NEXT: mtctr r6 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB0_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxv vs6, 0(r5) +; CHECK-BE-NEXT: addi r5, r5, 16 +; CHECK-BE-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-BE-NEXT: # kill: def $uacc2 killed $acc0 +; CHECK-BE-NEXT: bdnz .LBB0_2 +; CHECK-BE-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: blr +entry: + %0 = load <16 x i8>, <16 x i8>* %Src, align 16 + %arrayidx1 = getelementptr 
inbounds <16 x i8>, <16 x i8>* %Src, i64 1 + %1 = load <16 x i8>, <16 x i8>* %arrayidx1, align 16 + %2 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %0, <16 x i8> %1) + %3 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %cmp11 = icmp sgt i32 %Len, 2 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %Len to i64 + br label %for.body + +for.cond.cleanup: + %Acc.0.lcssa = phi <512 x i1> [ %3, %entry ], [ %13, %for.body ] + %4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa) + %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 0 + store <16 x i8> %5, <16 x i8>* %Dst, align 16 + %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 1 + %7 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 1 + store <16 x i8> %6, <16 x i8>* %7, align 16 + %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 2 + %9 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 2 + store <16 x i8> %8, <16 x i8>* %9, align 16 + %10 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 3 + %11 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 3 + store <16 x i8> %10, <16 x i8>* %11, align 16 + ret void + +for.body: + %indvars.iv = phi i64 [ 2, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %Acc.012 = phi <512 x i1> [ %3, %for.body.preheader ], [ %13, %for.body ] + %arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 %indvars.iv + %12 = load <16 x i8>, <16 x i8>* %arrayidx2, align 16 + %13 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.012, <256 x i1> %2, <16 x i8> %12) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1>, <16 x i8>) +define dso_local 
void @testPHI2(<16 x i8>* %Dst, <16 x i8>* %Src, i32 signext %Len) { +; CHECK-LABEL: testPHI2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs4, 0(r4) +; CHECK-NEXT: lxv vs5, 16(r4) +; CHECK-NEXT: lxv vs6, 32(r4) +; CHECK-NEXT: cmpwi r5, 4 +; CHECK-NEXT: xvf64ger acc0, vsp4, vs6 +; CHECK-NEXT: blt cr0, .LBB1_3 +; CHECK-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NEXT: addi r4, r4, 48 +; CHECK-NEXT: clrldi r5, r5, 32 +; CHECK-NEXT: addi r5, r5, -3 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lxv vs6, 0(r4) +; CHECK-NEXT: addi r4, r4, 16 +; CHECK-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-NEXT: # kill: def $uacc2 killed $acc0 +; CHECK-NEXT: bdnz .LBB1_2 +; CHECK-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testPHI2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs4, 0(r4) +; CHECK-BE-NEXT: lxv vs5, 16(r4) +; CHECK-BE-NEXT: lxv vs6, 32(r4) +; CHECK-BE-NEXT: cmpwi r5, 4 +; CHECK-BE-NEXT: xvf64ger acc0, vsp4, vs6 +; CHECK-BE-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-NEXT: addi r4, r4, 48 +; CHECK-BE-NEXT: clrldi r5, r5, 32 +; CHECK-BE-NEXT: addi r5, r5, -3 +; CHECK-BE-NEXT: mtctr r5 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB1_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxv vs6, 0(r4) +; CHECK-BE-NEXT: addi r4, r4, 16 +; CHECK-BE-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-BE-NEXT: # kill: def $uacc2 killed $acc0 +; CHECK-BE-NEXT: bdnz .LBB1_2 +; CHECK-BE-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: blr +entry: + %0 = load <16 x i8>, <16 x i8>* %Src, align 16 + %arrayidx1 = 
getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 1 + %1 = load <16 x i8>, <16 x i8>* %arrayidx1, align 16 + %2 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %0, <16 x i8> %1) + %arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 2 + %3 = load <16 x i8>, <16 x i8>* %arrayidx2, align 16 + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %2, <16 x i8> %3) + %cmp14 = icmp sgt i32 %Len, 3 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %Len to i64 + br label %for.body + +for.cond.cleanup: + %Acc.0.lcssa = phi <512 x i1> [ %4, %entry ], [ %14, %for.body ] + %5 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa) + %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 0 + store <16 x i8> %6, <16 x i8>* %Dst, align 16 + %7 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 1 + %8 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 1 + store <16 x i8> %7, <16 x i8>* %8, align 16 + %9 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 2 + %10 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 2 + store <16 x i8> %9, <16 x i8>* %10, align 16 + %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 3 + %12 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 3 + store <16 x i8> %11, <16 x i8>* %12, align 16 + ret void + +for.body: + %indvars.iv = phi i64 [ 3, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %Acc.015 = phi <512 x i1> [ %4, %for.body.preheader ], [ %14, %for.body ] + %arrayidx3 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 %indvars.iv + %13 = load <16 x i8>, <16 x i8>* %arrayidx3, align 16 + %14 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.015, <256 x i1> %2, <16 x i8> %13) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 
%wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; This test uses an unprimed accumulator PHI node with two operands: an +; implicitely defined unprimed accumulator and the unprimed result of the call +; to xvf64gerpp. The compiler should replace this PHI node by a primed +; accumulator PHI node. +define void @testImplicitDef(<16 x i8>* %ptr) { +; CHECK-LABEL: testImplicitDef: +; CHECK: # %bb.0: # %label1 +; CHECK-NEXT: # implicit-def: $acc0 +; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-NEXT: # %bb.1: # %label2 +; CHECK-NEXT: xvf64gerpp acc0, vsp0, vs0 +; CHECK-NEXT: .LBB2_2: # %label3 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 0(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testImplicitDef: +; CHECK-BE: # %bb.0: # %label1 +; CHECK-BE-NEXT: # implicit-def: $acc0 +; CHECK-BE-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-NEXT: # %bb.1: # %label2 +; CHECK-BE-NEXT: xvf64gerpp acc0, vsp0, vs0 +; CHECK-BE-NEXT: .LBB2_2: # %label3 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs3, 0(r3) +; CHECK-BE-NEXT: blr +label1: + br i1 undef, label %label3, label %label2 + +label2: + %0 = call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> undef, <256 x i1> undef, <16 x i8> undef) + br label %label3 + +label3: + %1 = phi <512 x i1> [ undef, %label1 ], [ %0, %label2 ] + %2 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %1) + %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %2, 3 + store <16 x i8> %3, <16 x i8>* %ptr, align 16 + ret void +} + +; This test uses an unprimed accumulator PHI node with an unprimed accumulator +; PHI node operand. The compiler should replace these PHI nodes by primed +; accumulator PHI nodes. 
+declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>) +define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %count, <512 x i1>* nocapture %ptr, <16 x i8> %vc) { +; CHECK-LABEL: testNestedPHI: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmplwi r3, 0 +; CHECK-NEXT: beq cr0, .LBB3_2 +; CHECK-NEXT: # %bb.1: # %if.then +; CHECK-NEXT: xvf32gernp acc0, v2, v2 +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: bge cr0, .LBB3_3 +; CHECK-NEXT: b .LBB3_5 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: # implicit-def: $acc0 +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: blt cr0, .LBB3_5 +; CHECK-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-NEXT: addi r3, r4, -1 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB3_4: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: xvf32gernp acc0, v2, v2 +; CHECK-NEXT: # kill: def $uacc1 killed $acc0 +; CHECK-NEXT: bdnz .LBB3_4 +; CHECK-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r5) +; CHECK-NEXT: stxv vs1, 32(r5) +; CHECK-NEXT: stxv vs2, 16(r5) +; CHECK-NEXT: stxv vs3, 0(r5) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testNestedPHI: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmplwi r3, 0 +; CHECK-BE-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-NEXT: # %bb.1: # %if.then +; CHECK-BE-NEXT: xvf32gernp acc0, v2, v2 +; CHECK-BE-NEXT: cmpwi r4, 1 +; CHECK-BE-NEXT: bge cr0, .LBB3_3 +; CHECK-BE-NEXT: b .LBB3_5 +; CHECK-BE-NEXT: .LBB3_2: +; CHECK-BE-NEXT: # implicit-def: $acc0 +; CHECK-BE-NEXT: cmpwi r4, 1 +; CHECK-BE-NEXT: blt cr0, .LBB3_5 +; CHECK-BE-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-NEXT: addi r3, r4, -1 +; CHECK-BE-NEXT: clrldi r3, r3, 32 +; CHECK-BE-NEXT: addi r3, r3, 1 +; CHECK-BE-NEXT: mtctr r3 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB3_4: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: xvf32gernp acc0, v2, v2 +; CHECK-BE-NEXT: # kill: def $uacc1 killed 
$acc0 +; CHECK-BE-NEXT: bdnz .LBB3_4 +; CHECK-BE-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-NEXT: li r3, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r5) +; CHECK-BE-NEXT: stxv vs0, 0(r5) +; CHECK-BE-NEXT: stxv vs3, 48(r5) +; CHECK-BE-NEXT: stxv vs2, 32(r5) +; CHECK-BE-NEXT: blr +entry: + %tobool.not = icmp eq i32 %cond, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> undef, <16 x i8> %vc, <16 x i8> %vc) + br label %if.end + +if.end: + %vq.0 = phi <512 x i1> [ %0, %if.then ], [ undef, %entry ] + %cmp9 = icmp sgt i32 %count, 0 + br i1 %cmp9, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + %vq.1.lcssa = phi <512 x i1> [ %vq.0, %if.end ], [ %1, %for.body ] + store <512 x i1> %vq.1.lcssa, <512 x i1>* %ptr, align 64 + ret i32 0 + +for.body: + %i.011 = phi i32 [ %inc, %for.body ], [ 0, %if.end ] + %vq.110 = phi <512 x i1> [ %1, %for.body ], [ %vq.0, %if.end ] + %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %vq.110, <16 x i8> %vc, <16 x i8> %vc) + %inc = add nuw nsw i32 %i.011, 1 + %exitcond.not = icmp eq i32 %inc, %count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/PowerPC/peephole-phi-acc.mir b/llvm/test/CodeGen/PowerPC/peephole-phi-acc.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/peephole-phi-acc.mir @@ -0,0 +1,839 @@ +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 %s -o - \ +# RUN: -run-pass=ppc-mi-peepholes -verify-machineinstrs | FileCheck %s + +# Test the peephole replacing unprimed accumulator PHI nodes by primed +# accumulator PHI nodes. We have a test for the simple case (PHI nodes with COPY +# operands), a test for PHI nodes with IMPLICIT_DEF operands, a test for PHI +# nodes with operands being other PHI nodes on unprimed accumulators and a test +# with an unprimed accumulator PHI node cycle. 
+ +--- | + ; ModuleID = 'test.ll' + source_filename = "test.c" + target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + target triple = "powerpc64le-unknown-linux-gnu" + + ; Function Attrs: nofree nounwind writeonly + define dso_local void @phiCopy(i32 signext %i, <16 x i8> %vc, <512 x i1>* nocapture %ptr) local_unnamed_addr #0 { + entry: + %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %tobool.not = icmp eq i32 %i, 0 + br i1 %tobool.not, label %if.end, label %if.then + + if.then: ; preds = %entry + %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) + br label %if.end + + if.end: ; preds = %if.then, %entry + %vq.0 = phi <512 x i1> [ %1, %if.then ], [ %0, %entry ] + store <512 x i1> %vq.0, <512 x i1>* %ptr, align 64 + ret void + } + + ; Function Attrs: nounwind readnone + declare <512 x i1> @llvm.ppc.mma.xxsetaccz() #1 + + ; Function Attrs: nounwind readnone + declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>) #1 + + ; Function Attrs: nofree nounwind writeonly + define dso_local void @phiCopyUndef(i32 signext %i, <16 x i8> %vc, <512 x i1>* nocapture %ptr) local_unnamed_addr #0 { + entry: + %tobool.not = icmp eq i32 %i, 0 + br i1 %tobool.not, label %if.end, label %if.then + + if.then: ; preds = %entry + %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> undef, <16 x i8> %vc, <16 x i8> %vc) + br label %if.end + + if.end: ; preds = %if.then, %entry + %vq.0 = phi <512 x i1> [ %0, %if.then ], [ undef, %entry ] + store <512 x i1> %vq.0, <512 x i1>* %ptr, align 64 + ret void + } + + ; Function Attrs: nofree nounwind writeonly + define dso_local void @phiPhis(i32 signext %i, <16 x i8> %vc, <512 x i1>* nocapture %ptr) local_unnamed_addr #0 { + entry: + %cmp6 = icmp sgt i32 %i, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %0 = add i32 %i, -1 + %xtraiter = and i32 %i, 7 + %1 = icmp ult i32 %0, 7 + br i1 %1, 
label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + + for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = and i32 %i, -8 + %2 = add i32 %unroll_iter, -8 + %3 = zext i32 %2 to i64 + %4 = lshr i64 %3, 3 + %5 = add nuw nsw i64 %4, 1 + call void @llvm.set.loop.iterations.i64(i64 %5) + br label %for.body + + for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %vq.07.unr = phi <512 x i1> [ undef, %for.body.preheader ], [ %18, %for.body ] + %lcmp.mod.not = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil.preheader + + for.body.epil.preheader: ; preds = %for.cond.cleanup.loopexit.unr-lcssa + %6 = add nsw i32 %xtraiter, -1 + %7 = zext i32 %6 to i64 + %8 = add nuw nsw i64 %7, 1 + call void @llvm.set.loop.iterations.i64(i64 %8) + br label %for.body.epil + + for.body.epil: ; preds = %for.body.epil.preheader, %for.body.epil + %vq.07.epil = phi <512 x i1> [ %9, %for.body.epil ], [ %vq.07.unr, %for.body.epil.preheader ] + %9 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %vq.07.epil, <16 x i8> %vc, <16 x i8> %vc) + %10 = call i1 @llvm.loop.decrement.i64(i64 1) + br i1 %10, label %for.body.epil, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + %vq.0.lcssa = phi <512 x i1> [ undef, %entry ], [ %vq.07.unr, %for.cond.cleanup.loopexit.unr-lcssa ], [ %9, %for.body.epil ] + %add.ptr = getelementptr inbounds <512 x i1>, <512 x i1>* %ptr, i64 1 + store <512 x i1> %vq.0.lcssa, <512 x i1>* %add.ptr, align 64 + ret void + + for.body: ; preds = %for.body, %for.body.preheader.new + %vq.07 = phi <512 x i1> [ undef, %for.body.preheader.new ], [ %18, %for.body ] + %11 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %vq.07, <16 x i8> %vc, <16 x i8> %vc) + %12 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %11, <16 x i8> %vc, <16 x i8> %vc) + %13 = tail call <512 x i1> 
@llvm.ppc.mma.xvf32gerpp(<512 x i1> %12, <16 x i8> %vc, <16 x i8> %vc) + %14 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %13, <16 x i8> %vc, <16 x i8> %vc) + %15 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %14, <16 x i8> %vc, <16 x i8> %vc) + %16 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %15, <16 x i8> %vc, <16 x i8> %vc) + %17 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %16, <16 x i8> %vc, <16 x i8> %vc) + %18 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %17, <16 x i8> %vc, <16 x i8> %vc) + %19 = call i1 @llvm.loop.decrement.i64(i64 1) + br i1 %19, label %for.body, label %for.cond.cleanup.loopexit.unr-lcssa + } + + ; Function Attrs: nofree nounwind writeonly + define dso_local void @phiCycle(i32 signext %i, <16 x i8> %vc, <512 x i1>* nocapture %ptr) local_unnamed_addr #0 { + entry: + %cmp6 = icmp sgt i32 %i, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %0 = add i32 %i, -1 + %xtraiter = and i32 %i, 7 + %1 = icmp ult i32 %0, 7 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + + for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = and i32 %i, -8 + %2 = add i32 %unroll_iter, -8 + %3 = zext i32 %2 to i64 + %4 = lshr i64 %3, 3 + %5 = add nuw nsw i64 %4, 1 + call void @llvm.set.loop.iterations.i64(i64 %5) + br label %for.body + + for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %vq.07.unr = phi <512 x i1> [ undef, %for.body.preheader ], [ %18, %for.body ], [ %vq.07.epil, %for.body.epil ] + %lcmp.mod.not = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil.preheader + + for.body.epil.preheader: ; preds = %for.cond.cleanup.loopexit.unr-lcssa + %6 = add nsw i32 %xtraiter, -1 + %7 = zext i32 %6 to i64 + %8 = add nuw nsw i64 %7, 1 + call void @llvm.set.loop.iterations.i64(i64 %8) + br label %for.body.epil + + 
for.body.epil: ; preds = %for.body.epil.preheader, %for.body.epil + %vq.07.epil = phi <512 x i1> [ %9, %for.body.epil ], [ %vq.07.unr, %for.body.epil.preheader ] + %9 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %vq.07.epil, <16 x i8> %vc, <16 x i8> %vc) + %10 = call i1 @llvm.loop.decrement.i64(i64 1) + %test = icmp ult i32 %0, 7 + br i1 %test, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.epil + ;br i1 %10, label %for.body.epil, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + %vq.0.lcssa = phi <512 x i1> [ undef, %entry ], [ %vq.07.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %add.ptr = getelementptr inbounds <512 x i1>, <512 x i1>* %ptr, i64 1 + store <512 x i1> %vq.0.lcssa, <512 x i1>* %add.ptr, align 64 + ret void + + for.body: ; preds = %for.body, %for.body.preheader.new + %vq.07 = phi <512 x i1> [ undef, %for.body.preheader.new ], [ %18, %for.body ] + %11 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %vq.07, <16 x i8> %vc, <16 x i8> %vc) + %12 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %11, <16 x i8> %vc, <16 x i8> %vc) + %13 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %12, <16 x i8> %vc, <16 x i8> %vc) + %14 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %13, <16 x i8> %vc, <16 x i8> %vc) + %15 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %14, <16 x i8> %vc, <16 x i8> %vc) + %16 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %15, <16 x i8> %vc, <16 x i8> %vc) + %17 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %16, <16 x i8> %vc, <16 x i8> %vc) + %18 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %17, <16 x i8> %vc, <16 x i8> %vc) + %19 = call i1 @llvm.loop.decrement.i64(i64 1) + br i1 %19, label %for.body, label %for.cond.cleanup.loopexit.unr-lcssa + } + + ; Function Attrs: noduplicate nofree nosync nounwind willreturn + declare void 
@llvm.set.loop.iterations.i64(i64) #2 + + ; Function Attrs: noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i64(i64) #2 + + attributes #0 = { nofree nounwind writeonly "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr10" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+mma,+paired-vector-memops,+pcrelative-memops,+power10-vector,+power8-vector,+power9-vector,+vsx,-htm,-spe" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { nounwind readnone "target-cpu"="pwr10" } + attributes #2 = { noduplicate nofree nosync nounwind willreturn } + +... +--- +name: phiCopy +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: uaccrc, preferred-register: '' } + - { id: 1, class: uaccrc, preferred-register: '' } + - { id: 2, class: uaccrc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: vrrc, preferred-register: '' } + - { id: 5, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 6, class: gprc, preferred-register: '' } + - { id: 7, class: accrc, preferred-register: '' } + - { id: 8, class: crrc, preferred-register: '' } + - { id: 9, class: vsrc, preferred-register: '' } + - { id: 10, class: accrc, preferred-register: '' } + - { id: 11, class: accrc, preferred-register: '' } + - { id: 12, class: accrc, preferred-register: '' } + - { id: 13, class: accrc, preferred-register: '' } + - { id: 14, class: vsrc, preferred-register: '' } + - { id: 15, class: vsrprc, preferred-register: '' } + - { id: 16, class: vsrprc, preferred-register: '' } + - { id: 17, class: vsrc, 
preferred-register: '' } + - { id: 18, class: vsrprc, preferred-register: '' } + - { id: 19, class: vsrprc, preferred-register: '' } + - { id: 20, class: vsrc, preferred-register: '' } + - { id: 21, class: vsrprc, preferred-register: '' } + - { id: 22, class: vsrprc, preferred-register: '' } + - { id: 23, class: vsrc, preferred-register: '' } + - { id: 24, class: vsrprc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%3' } + - { reg: '$v2', virtual-reg: '%4' } + - { reg: '$x7', virtual-reg: '%5' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $x3, $v2, $x7 + + %5:g8rc_and_g8rc_nox0 = COPY $x7 + %4:vrrc = COPY $v2 + %3:g8rc = COPY $x3 + %6:gprc = COPY %3.sub_32 + %7:accrc = XXSETACCZ + %0:uaccrc = COPY %7 + %8:crrc = CMPLWI killed %6, 0 + BCC 76, killed %8, %bb.2 + B %bb.1 + + bb.1.if.then: + successors: %bb.2(0x80000000) + + %9:vsrc = COPY %4 + %11:accrc = COPY %7 + %10:accrc = XVF32GERPP %11, %9, %9 + %1:uaccrc = COPY %10 + + bb.2.if.end: + ; We check the PHI node on primed accumulator is inserted after the label + ; CHECK-LABEL: name: phiCopy + ; CHECK-LABEL: bb.{{[0-9]}}.if.end: + ; CHECK-NEXT: :accrc = PHI %7, %bb.0, %10, %bb.1 + ; CHECK-NEXT: %2:uaccrc = PHI + %2:uaccrc = PHI %0, %bb.0, %1, %bb.1 + %13:accrc = COPY %2 + %12:accrc = XXMFACC %13 + %14:vsrc = COPY %12.sub_vsx1 + %16:vsrprc = IMPLICIT_DEF + %15:vsrprc = INSERT_SUBREG %16, killed %14, %subreg.sub_vsx1 + 
%17:vsrc = COPY %12.sub_vsx0 + %18:vsrprc = INSERT_SUBREG %15, killed %17, %subreg.sub_vsx0 + STXVP killed %18, 32, %5 :: (store 32 into %ir.ptr + 32) + %19:vsrprc = COPY %12.sub_pair1 + %20:vsrc = COPY %19.sub_vsx1 + %22:vsrprc = IMPLICIT_DEF + %21:vsrprc = INSERT_SUBREG %22, killed %20, %subreg.sub_vsx1 + %23:vsrc = COPY %19.sub_vsx0 + %24:vsrprc = INSERT_SUBREG %21, killed %23, %subreg.sub_vsx0 + STXVP killed %24, 0, %5 :: (store 32 into %ir.ptr, align 64) + BLR8 implicit $lr8, implicit $rm + +... +--- +name: phiCopyUndef +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: uaccrc, preferred-register: '' } + - { id: 1, class: uaccrc, preferred-register: '' } + - { id: 2, class: g8rc, preferred-register: '' } + - { id: 3, class: vrrc, preferred-register: '' } + - { id: 4, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 5, class: uaccrc, preferred-register: '' } + - { id: 6, class: gprc, preferred-register: '' } + - { id: 7, class: crrc, preferred-register: '' } + - { id: 8, class: vsrc, preferred-register: '' } + - { id: 9, class: accrc, preferred-register: '' } + - { id: 10, class: uaccrc, preferred-register: '' } + - { id: 11, class: accrc, preferred-register: '' } + - { id: 12, class: accrc, preferred-register: '' } + - { id: 13, class: accrc, preferred-register: '' } + - { id: 14, class: vsrc, preferred-register: '' } + - { id: 15, class: vsrprc, preferred-register: '' } + - { id: 16, class: vsrprc, preferred-register: '' } + - { id: 17, class: vsrc, preferred-register: '' } + - { id: 18, class: vsrprc, preferred-register: '' } + - { id: 19, class: vsrprc, preferred-register: '' } + - { id: 20, class: vsrc, preferred-register: '' } + - { id: 21, class: vsrprc, preferred-register: '' } + - { id: 22, class: vsrprc, preferred-register: '' } + - { id: 23, class: vsrc, preferred-register: '' } + - { id: 24, 
class: vsrprc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%2' } + - { reg: '$v2', virtual-reg: '%3' } + - { reg: '$x7', virtual-reg: '%4' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.3(0x30000000), %bb.1(0x50000000) + liveins: $x3, $v2, $x7 + + %4:g8rc_and_g8rc_nox0 = COPY $x7 + %3:vrrc = COPY $v2 + %2:g8rc = COPY $x3 + %6:gprc = COPY %2.sub_32 + %7:crrc = CMPLWI killed %6, 0 + BCC 68, killed %7, %bb.1 + + bb.3: + successors: %bb.2(0x80000000) + + %5:uaccrc = IMPLICIT_DEF + B %bb.2 + + bb.1.if.then: + successors: %bb.2(0x80000000) + + %8:vsrc = COPY %3 + %10:uaccrc = IMPLICIT_DEF + %11:accrc = COPY %10 + %9:accrc = XVF32GERPP %11, %8, %8 + %0:uaccrc = COPY %9 + + bb.2.if.end: + ; We check the PHI node on primed accumulator is inserted after the label + ; CHECK-LABEL: name: phiCopyUndef + ; CHECK-LABEL: bb.{{[0-9]}}.if.end: + ; CHECK-NEXT: :accrc = PHI + ; CHECK-NEXT: %1:uaccrc = PHI + %1:uaccrc = PHI %5, %bb.3, %0, %bb.1 + %13:accrc = COPY %1 + %12:accrc = XXMFACC %13 + %14:vsrc = COPY %12.sub_vsx1 + %16:vsrprc = IMPLICIT_DEF + %15:vsrprc = INSERT_SUBREG %16, killed %14, %subreg.sub_vsx1 + %17:vsrc = COPY %12.sub_vsx0 + %18:vsrprc = INSERT_SUBREG %15, killed %17, %subreg.sub_vsx0 + STXVP killed %18, 32, %4 :: (store 32 into %ir.ptr + 32) + %19:vsrprc = COPY %12.sub_pair1 + %20:vsrc = COPY %19.sub_vsx1 + %22:vsrprc = IMPLICIT_DEF + %21:vsrprc = INSERT_SUBREG %22, killed %20, %subreg.sub_vsx1 + %23:vsrc = 
COPY %19.sub_vsx0 + %24:vsrprc = INSERT_SUBREG %21, killed %23, %subreg.sub_vsx0 + STXVP killed %24, 0, %4 :: (store 32 into %ir.ptr, align 64) + BLR8 implicit $lr8, implicit $rm + +... +--- +name: phiPhis +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 1, class: uaccrc, preferred-register: '' } + - { id: 2, class: uaccrc, preferred-register: '' } + - { id: 3, class: uaccrc, preferred-register: '' } + - { id: 4, class: uaccrc, preferred-register: '' } + - { id: 5, class: uaccrc, preferred-register: '' } + - { id: 6, class: uaccrc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: vrrc, preferred-register: '' } + - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 10, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 11, class: uaccrc, preferred-register: '' } + - { id: 12, class: crrc, preferred-register: '' } + - { id: 13, class: uaccrc, preferred-register: '' } + - { id: 14, class: gprc, preferred-register: '' } + - { id: 15, class: crrc, preferred-register: '' } + - { id: 16, class: uaccrc, preferred-register: '' } + - { id: 17, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 18, class: gprc, preferred-register: '' } + - { id: 19, class: g8rc, preferred-register: '' } + - { id: 20, class: g8rc, preferred-register: '' } + - { id: 21, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 22, class: g8rc, preferred-register: '' } + - { id: 23, class: vsrc, preferred-register: '' } + - { id: 24, class: accrc, preferred-register: '' } + - { id: 25, class: accrc, preferred-register: '' } + - { id: 26, class: accrc, preferred-register: '' } + - { id: 27, class: accrc, preferred-register: '' } + - { id: 28, class: accrc, preferred-register: '' } + - { id: 29, class: 
accrc, preferred-register: '' } + - { id: 30, class: accrc, preferred-register: '' } + - { id: 31, class: accrc, preferred-register: '' } + - { id: 32, class: accrc, preferred-register: '' } + - { id: 33, class: crrc, preferred-register: '' } + - { id: 34, class: gprc, preferred-register: '' } + - { id: 35, class: g8rc, preferred-register: '' } + - { id: 36, class: g8rc, preferred-register: '' } + - { id: 37, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 38, class: g8rc, preferred-register: '' } + - { id: 39, class: vsrc, preferred-register: '' } + - { id: 40, class: accrc, preferred-register: '' } + - { id: 41, class: accrc, preferred-register: '' } + - { id: 42, class: accrc, preferred-register: '' } + - { id: 43, class: accrc, preferred-register: '' } + - { id: 44, class: vsrc, preferred-register: '' } + - { id: 45, class: vsrprc, preferred-register: '' } + - { id: 46, class: vsrprc, preferred-register: '' } + - { id: 47, class: vsrc, preferred-register: '' } + - { id: 48, class: vsrprc, preferred-register: '' } + - { id: 49, class: vsrprc, preferred-register: '' } + - { id: 50, class: vsrc, preferred-register: '' } + - { id: 51, class: vsrprc, preferred-register: '' } + - { id: 52, class: vsrprc, preferred-register: '' } + - { id: 53, class: vsrc, preferred-register: '' } + - { id: 54, class: vsrprc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%7' } + - { reg: '$v2', virtual-reg: '%8' } + - { reg: '$x7', virtual-reg: '%9' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] 
+constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.8(0x30000000) + liveins: $x3, $v2, $x7 + + %9:g8rc_and_g8rc_nox0 = COPY $x7 + %8:vrrc = COPY $v2 + %7:g8rc = COPY $x3 + %10:gprc_and_gprc_nor0 = COPY %7.sub_32 + %12:crrc = CMPWI %10, 1 + BCC 4, killed %12, %bb.1 + + bb.8: + successors: %bb.6(0x80000000) + + %11:uaccrc = IMPLICIT_DEF + B %bb.6 + + bb.1.for.body.preheader: + successors: %bb.3(0x40000000), %bb.2(0x40000000) + + %14:gprc = ADDI %10, -1 + %0:gprc_and_gprc_nor0 = RLWINM %10, 0, 29, 31 + %13:uaccrc = IMPLICIT_DEF + %15:crrc = CMPLWI killed %14, 7 + BCC 12, killed %15, %bb.3 + B %bb.2 + + bb.2.for.body.preheader.new: + successors: %bb.7(0x80000000) + + %17:gprc_and_gprc_nor0 = RLWINM %10, 0, 0, 28 + %18:gprc = ADDI killed %17, -8 + %20:g8rc = IMPLICIT_DEF + %19:g8rc = INSERT_SUBREG %20, killed %18, %subreg.sub_32 + %21:g8rc_and_g8rc_nox0 = RLWINM8 %19, 29, 3, 31 + %22:g8rc = nuw nsw ADDI8 killed %21, 1 + MTCTR8loop killed %22, implicit-def dead $ctr8 + %16:uaccrc = IMPLICIT_DEF + B %bb.7 + + bb.3.for.cond.cleanup.loopexit.unr-lcssa: + successors: %bb.6(0x30000000), %bb.4(0x50000000) + + %1:uaccrc = PHI %13, %bb.1, %6, %bb.7 + %33:crrc = CMPLWI %0, 0 + BCC 76, killed %33, %bb.6 + B %bb.4 + + bb.4.for.body.epil.preheader: + successors: %bb.5(0x80000000) + + %34:gprc = nsw ADDI %0, -1 + %36:g8rc = IMPLICIT_DEF + %35:g8rc = INSERT_SUBREG %36, killed %34, %subreg.sub_32 + %37:g8rc_and_g8rc_nox0 = RLDICL killed %35, 0, 32 + %38:g8rc = nuw nsw ADDI8 killed %37, 1 + MTCTR8loop killed %38, implicit-def dead $ctr8 + + bb.5.for.body.epil: + successors: %bb.5(0x7c000000), %bb.6(0x04000000) + ; We check the PHI node on primed accumulator is inserted after the label + ; CHECK-LABEL: name: phiPhis + ; CHECK-LABEL: bb.{{[0-9]}}.for.body.epil: + ; CHECK-NEXT: successors: %bb.{{[0-9]}}(0x{{[0-9a-f]+}}), %bb.{{[0-9]}}(0x{{[0-9a-f]+}}) + ; CHECK-NEXT: {{ }} + ; CHECK-NEXT: :accrc = PHI + ; CHECK-NEXT: %2:uaccrc = PHI + 
%2:uaccrc = PHI %1, %bb.4, %3, %bb.5 + %39:vsrc = COPY %8 + %41:accrc = COPY %2 + %40:accrc = XVF32GERPP %41, %39, %39 + %3:uaccrc = COPY %40 + BDNZ8 %bb.5, implicit-def dead $ctr8, implicit $ctr8 + B %bb.6 + + bb.6.for.cond.cleanup: + %4:uaccrc = PHI %11, %bb.8, %1, %bb.3, %3, %bb.5 + %43:accrc = COPY %4 + %42:accrc = XXMFACC %43 + %44:vsrc = COPY %42.sub_vsx1 + %46:vsrprc = IMPLICIT_DEF + %45:vsrprc = INSERT_SUBREG %46, killed %44, %subreg.sub_vsx1 + %47:vsrc = COPY %42.sub_vsx0 + %48:vsrprc = INSERT_SUBREG %45, killed %47, %subreg.sub_vsx0 + STXVP killed %48, 96, %9 :: (store 32 into %ir.add.ptr + 32) + %49:vsrprc = COPY %42.sub_pair1 + %50:vsrc = COPY %49.sub_vsx1 + %52:vsrprc = IMPLICIT_DEF + %51:vsrprc = INSERT_SUBREG %52, killed %50, %subreg.sub_vsx1 + %53:vsrc = COPY %49.sub_vsx0 + %54:vsrprc = INSERT_SUBREG %51, killed %53, %subreg.sub_vsx0 + STXVP killed %54, 64, %9 :: (store 32 into %ir.add.ptr, align 64) + BLR8 implicit $lr8, implicit $rm + + bb.7.for.body: + successors: %bb.7(0x7c000000), %bb.3(0x04000000) + + %5:uaccrc = PHI %16, %bb.2, %6, %bb.7 + %23:vsrc = COPY %8 + %25:accrc = COPY %5 + %24:accrc = XVF32GERPP %25, %23, %23 + %26:accrc = XVF32GERPP %24, %23, %23 + %27:accrc = XVF32GERPP %26, %23, %23 + %28:accrc = XVF32GERPP %27, %23, %23 + %29:accrc = XVF32GERPP %28, %23, %23 + %30:accrc = XVF32GERPP %29, %23, %23 + %31:accrc = XVF32GERPP %30, %23, %23 + %32:accrc = XVF32GERPP %31, %23, %23 + %6:uaccrc = COPY %32 + BDNZ8 %bb.7, implicit-def dead $ctr8, implicit $ctr8 + B %bb.3 + +... 
+--- +name: phiCycle +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 1, class: uaccrc, preferred-register: '' } + - { id: 2, class: uaccrc, preferred-register: '' } + - { id: 3, class: uaccrc, preferred-register: '' } + - { id: 4, class: uaccrc, preferred-register: '' } + - { id: 5, class: uaccrc, preferred-register: '' } + - { id: 6, class: uaccrc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: vrrc, preferred-register: '' } + - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 10, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 11, class: uaccrc, preferred-register: '' } + - { id: 12, class: crrc, preferred-register: '' } + - { id: 13, class: uaccrc, preferred-register: '' } + - { id: 14, class: gprc, preferred-register: '' } + - { id: 15, class: crrc, preferred-register: '' } + - { id: 16, class: uaccrc, preferred-register: '' } + - { id: 17, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 18, class: gprc, preferred-register: '' } + - { id: 19, class: g8rc, preferred-register: '' } + - { id: 20, class: g8rc, preferred-register: '' } + - { id: 21, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 22, class: g8rc, preferred-register: '' } + - { id: 23, class: vsrc, preferred-register: '' } + - { id: 24, class: accrc, preferred-register: '' } + - { id: 25, class: accrc, preferred-register: '' } + - { id: 26, class: accrc, preferred-register: '' } + - { id: 27, class: accrc, preferred-register: '' } + - { id: 28, class: accrc, preferred-register: '' } + - { id: 29, class: accrc, preferred-register: '' } + - { id: 30, class: accrc, preferred-register: '' } + - { id: 31, class: accrc, preferred-register: '' } + - { id: 32, class: accrc, preferred-register: '' } 
+ - { id: 33, class: crrc, preferred-register: '' } + - { id: 34, class: gprc, preferred-register: '' } + - { id: 35, class: g8rc, preferred-register: '' } + - { id: 36, class: g8rc, preferred-register: '' } + - { id: 37, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 38, class: g8rc, preferred-register: '' } + - { id: 39, class: vsrc, preferred-register: '' } + - { id: 40, class: accrc, preferred-register: '' } + - { id: 41, class: accrc, preferred-register: '' } + - { id: 42, class: accrc, preferred-register: '' } + - { id: 43, class: accrc, preferred-register: '' } + - { id: 44, class: vsrc, preferred-register: '' } + - { id: 45, class: vsrprc, preferred-register: '' } + - { id: 46, class: vsrprc, preferred-register: '' } + - { id: 47, class: vsrc, preferred-register: '' } + - { id: 48, class: vsrprc, preferred-register: '' } + - { id: 49, class: vsrprc, preferred-register: '' } + - { id: 50, class: vsrc, preferred-register: '' } + - { id: 51, class: vsrprc, preferred-register: '' } + - { id: 52, class: vsrprc, preferred-register: '' } + - { id: 53, class: vsrc, preferred-register: '' } + - { id: 54, class: vsrprc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%7' } + - { reg: '$v2', virtual-reg: '%8' } + - { reg: '$x7', virtual-reg: '%9' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.8(0x30000000) + liveins: $x3, $v2, $x7 + + %9:g8rc_and_g8rc_nox0 = COPY $x7 + %8:vrrc = COPY 
$v2 + %7:g8rc = COPY $x3 + %10:gprc_and_gprc_nor0 = COPY %7.sub_32 + %12:crrc = CMPWI %10, 1 + BCC 4, killed %12, %bb.1 + + bb.8: + successors: %bb.6(0x80000000) + + %11:uaccrc = IMPLICIT_DEF + B %bb.6 + + bb.1.for.body.preheader: + successors: %bb.3(0x40000000), %bb.2(0x40000000) + + %14:gprc = ADDI %10, -1 + %0:gprc_and_gprc_nor0 = RLWINM %10, 0, 29, 31 + %13:uaccrc = IMPLICIT_DEF + %15:crrc = CMPLWI %14, 7 + BCC 12, killed %15, %bb.3 + B %bb.2 + + bb.2.for.body.preheader.new: + successors: %bb.7(0x80000000) + + %17:gprc_and_gprc_nor0 = RLWINM %10, 0, 0, 28 + %18:gprc = ADDI killed %17, -8 + %20:g8rc = IMPLICIT_DEF + %19:g8rc = INSERT_SUBREG %20, killed %18, %subreg.sub_32 + %21:g8rc_and_g8rc_nox0 = RLWINM8 %19, 29, 3, 31 + %22:g8rc = nuw nsw ADDI8 killed %21, 1 + MTCTR8loop killed %22, implicit-def dead $ctr8 + %16:uaccrc = IMPLICIT_DEF + B %bb.7 + + bb.3.for.cond.cleanup.loopexit.unr-lcssa: + successors: %bb.6(0x30000000), %bb.4(0x50000000) + ; We check no phi node is inserted in the block + ; CHECK-LABEL: name: phiCycle + ; CHECK-LABEL: bb.{{[0-9]}}.for.cond.cleanup.loopexit.unr-lcssa: + ; CHECK-NEXT: successors: %bb.{{[0-9]}}(0x{{[0-9a-f]+}}), %bb.{{[0-9]}}(0x{{[0-9a-f]+}}) + ; CHECK-NEXT: {{ }} + ; CHECK-NEXT: %1:uaccrc = PHI + ; CHECK-NEXT: %33:crrc + %1:uaccrc = PHI %13, %bb.1, %6, %bb.7, %2, %bb.5 + %33:crrc = CMPLWI %0, 0 + BCC 76, killed %33, %bb.6 + B %bb.4 + + bb.4.for.body.epil.preheader: + successors: %bb.5(0x80000000) + + %34:gprc = nsw ADDI %0, -1 + %36:g8rc = IMPLICIT_DEF + %35:g8rc = INSERT_SUBREG %36, killed %34, %subreg.sub_32 + %37:g8rc_and_g8rc_nox0 = RLDICL killed %35, 0, 32 + %38:g8rc = nuw nsw ADDI8 killed %37, 1 + MTCTR8loop killed %38, implicit-def dead $ctr8 + + bb.5.for.body.epil: + successors: %bb.3(0x40000000), %bb.5(0x7c000000) + ; We check no phi node is inserted in the block + ; CHECK-LABEL: bb.{{[0-9]}}.for.body.epil: + ; CHECK-NEXT: successors: %bb.{{[0-9]}}(0x{{[0-9a-f]+}}), %bb.{{[0-9]}}(0x{{[0-9a-f]+}}) + ; CHECK-NEXT: {{ }} 
+ ; CHECK-NEXT: %2:uaccrc = PHI + ; CHECK-NEXT: %39:vsrc + %2:uaccrc = PHI %1, %bb.4, %3, %bb.5 + %39:vsrc = COPY %8 + %41:accrc = COPY %2 + %40:accrc = XVF32GERPP %41, %39, %39 + %3:uaccrc = COPY %40 + %15:crrc = CMPLWI %14, 7 + BCC 12, killed %15, %bb.5 + B %bb.3 + + bb.6.for.cond.cleanup: + %4:uaccrc = PHI %11, %bb.8, %1, %bb.3 + %43:accrc = COPY %4 + %42:accrc = XXMFACC %43 + %44:vsrc = COPY %42.sub_vsx1 + %46:vsrprc = IMPLICIT_DEF + %45:vsrprc = INSERT_SUBREG %46, killed %44, %subreg.sub_vsx1 + %47:vsrc = COPY %42.sub_vsx0 + %48:vsrprc = INSERT_SUBREG %45, killed %47, %subreg.sub_vsx0 + STXVP killed %48, 96, %9 :: (store 32 into %ir.add.ptr + 32) + %49:vsrprc = COPY %42.sub_pair1 + %50:vsrc = COPY %49.sub_vsx1 + %52:vsrprc = IMPLICIT_DEF + %51:vsrprc = INSERT_SUBREG %52, killed %50, %subreg.sub_vsx1 + %53:vsrc = COPY %49.sub_vsx0 + %54:vsrprc = INSERT_SUBREG %51, killed %53, %subreg.sub_vsx0 + STXVP killed %54, 64, %9 :: (store 32 into %ir.add.ptr, align 64) + BLR8 implicit $lr8, implicit $rm + + bb.7.for.body: + successors: %bb.7(0x7c000000), %bb.3(0x04000000) + + %5:uaccrc = PHI %16, %bb.2, %6, %bb.7 + %23:vsrc = COPY %8 + %25:accrc = COPY %5 + %24:accrc = XVF32GERPP %25, %23, %23 + %26:accrc = XVF32GERPP %24, %23, %23 + %27:accrc = XVF32GERPP %26, %23, %23 + %28:accrc = XVF32GERPP %27, %23, %23 + %29:accrc = XVF32GERPP %28, %23, %23 + %30:accrc = XVF32GERPP %29, %23, %23 + %31:accrc = XVF32GERPP %30, %23, %23 + %32:accrc = XVF32GERPP %31, %23, %23 + %6:uaccrc = COPY %32 + BDNZ8 %bb.7, implicit-def dead $ctr8, implicit $ctr8 + B %bb.3 + +...