Index: llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -435,28 +435,178 @@
         ToErase = &MI;
         Simplified = true;
       }
-      // Splat fed by a shift. Usually when we align value to splat into
-      // vector element zero.
-      if (DefOpcode == PPC::XXSLDWI) {
+
+      // Word splats fed by permutations/shifts. The following cases apply:
+      // 1) Simplify the following sequence: load->xxpermdi->splat and emit
+      //    a splatting load (LXVWSX). This is performed on POWER 9.
+      // 2) Simplify a splat fed by a permute (eg. xxpermdi->splat) and
+      //    remove the permute because it is redundant.
+      bool LoadSplatted = false;
+
+      // Types of permutes or shifts allowed to immediately feed a splat.
+      bool isSupportedPermuteOp = (DefOpcode == PPC::XXPERMDIs) ||
+          (DefOpcode == PPC::XXSLDWIs) || (DefOpcode == PPC::XXPERMDI) ||
+          (DefOpcode == PPC::XXSLDWI);
+
+      // Types of loads allowed to precede a splat, in the situation of
+      // emitting a splatting load (ld->splat or ld->perm->splat => LXVWSX).
+      auto isSupportedLoadOp = [=](unsigned opCode) -> bool {
+        return (opCode == PPC::LIWZX || opCode == PPC::LIWAX ||
+                opCode == PPC::LXSDX || opCode == PPC::LXSIBZX ||
+                opCode == PPC::LXSIHZX || opCode == PPC::LXSSPX ||
+                opCode == PPC::LXVX || opCode == PPC::LXVB16X ||
+                opCode == PPC::LXVD2X || opCode == PPC::LXVDSX ||
+                opCode == PPC::LXVH8X || opCode == PPC::LXVL ||
+                opCode == PPC::LXVLL || opCode == PPC::LXVW4X);
+      };
+
+      // Detect a load -> permute -> splat sequence.
+      auto isLoadPermAndSplat = [=]() -> bool {
+        if (MF->getSubtarget().hasP9Vector() &&
+            MyOpcode == PPC::XXSPLTW && isSupportedPermuteOp) {
+          // Retrieve the instruction that precedes the splat (represented as
+          // DefMI) and ensure that it is a valid (permute) instruction.
+          unsigned PermuteReg =
+              TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+
+          if (TargetRegisterInfo::isVirtualRegister(PermuteReg)) {
+            // The instruction that precedes the splat should be a load.
+            MachineInstr *LoadMI = MRI->getVRegDef(PermuteReg);
+            if (LoadMI && isSupportedLoadOp(LoadMI->getOpcode()))
+              return true;
+          }
+        }
+        return false;
+      };
+
+      // Detect a load -> splat sequence.
+      bool isLoadAndSplat = (MF->getSubtarget().hasP9Vector()
+                             && MyOpcode == PPC::XXSPLTW
+                             && isSupportedLoadOp(DefOpcode));
+
+      // CASE 1: (P9 ONLY) Emit a splatting load in the following sequence:
+      // load->swap/shift/any permutation->splat, or load->splat.
+      // This case covers when the permutations are of XXPERMDI, XXPERMDIs,
+      // as well as shifts (XXSLDWIs, XXSLDWI).
+      if (isLoadPermAndSplat()) {
+
+        // Retrieve the load (LoadMI) that precedes the splat instruction.
+        // Checking if the load is valid is handled in isLoadPermAndSplat().
+        unsigned SwapReg =
+            TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+        MachineInstr *LoadMI = MRI->getVRegDef(SwapReg);
+
+        // Get the results of the permute and the load instruction to
+        // check if they are of single use.
+        unsigned PermRes = DefMI->getOperand(0).getReg();
+        unsigned LoadRes = LoadMI->getOperand(0).getReg();
+
+        // If we have: load->perm->splat, emit a splatting load instead.
+        // Change the load into a splatting load, if it has a single use.
+        if (MRI->hasOneNonDBGUse(LoadRes)) {
+          LLVM_DEBUG(dbgs() << "Optimizing ld/swp/splat => splatting load. ");
+          LLVM_DEBUG(LoadMI->dump());
+
+          LoadMI->setDesc(TII->get(PPC::LXVWSX));
+          MRI->setRegClass(LoadMI->getOperand(0).getReg(), &PPC::VSRCRegClass);
+
+          // Change the splat into a copy, and ensure the result of the
+          // splatting load gets forwarded into the users of the splat.
+ MI.setDesc(TII->get(PPC::COPY)); + MI.getOperand(1).setReg(LoadMI->getOperand(0).getReg()); + MI.RemoveOperand(2); + } + + // Remove the permute operation (DefMI) if it has a single use. + if (MRI->hasOneNonDBGUse(PermRes)) { + LLVM_DEBUG(dbgs() << "Removing redundant permute: "); + LLVM_DEBUG(DefMI->dump()); + ToErase = DefMI; + } + Simplified = true; + LoadSplatted = true; + } + else if (isLoadAndSplat) { + // Emit a splatting load if the load (DefMI) has a single use. + unsigned LoadRes = DefMI->getOperand(0).getReg(); + if (MRI->hasOneNonDBGUse(LoadRes)) { + LLVM_DEBUG(dbgs() << "Optimizing load/splat => splatting load"); + LLVM_DEBUG(DefMI->dump()); + DefMI->setDesc(TII->get(PPC::LXVWSX)); + MRI->setRegClass(DefMI->getOperand(0).getReg(), &PPC::VSRCRegClass); + + // Change the splat (MI) to COPY to forward the splatting load's + // result to the users of the splat. + MI.setDesc(TII->get(PPC::COPY)); + MI.getOperand(1).setReg(DefMI->getOperand(0).getReg()); + MI.RemoveOperand(2); + } + Simplified = true; + LoadSplatted = true; + } + + // CASE 2: (P8, P9) Check for a splat fed by a shift, or permute; + // usually when we align value to splat into vector element zero. + // The splat can select any of the vector elements to perform the splat, + // so a shift/permute of the vector elements prior to splatting is + // unnecessary. Keep track of where the element was prior to the splat + // and splat that element instead. + if (!LoadSplatted && MyOpcode == PPC::XXSPLTW && isSupportedPermuteOp) { + // Save the necessary operands of the permute/shift and splat to be + // later used to keep track of where the element was before. 
unsigned ShiftRes = DefMI->getOperand(0).getReg(); unsigned ShiftOp1 = DefMI->getOperand(1).getReg(); - unsigned ShiftOp2 = DefMI->getOperand(2).getReg(); - unsigned ShiftImm = DefMI->getOperand(3).getImm(); unsigned SplatImm = MI.getOperand(2).getImm(); - if (ShiftOp1 == ShiftOp2) { - unsigned NewElem = (SplatImm + ShiftImm) & 0x3; - if (MRI->hasOneNonDBGUse(ShiftRes)) { - LLVM_DEBUG(dbgs() << "Removing redundant shift: "); - LLVM_DEBUG(DefMI->dump()); - ToErase = DefMI; - } - Simplified = true; - LLVM_DEBUG(dbgs() << "Changing splat immediate from " << SplatImm - << " to " << NewElem << " in instruction: "); - LLVM_DEBUG(MI.dump()); - MI.getOperand(1).setReg(ShiftOp1); - MI.getOperand(2).setImm(NewElem); + unsigned ShiftOp2; + unsigned ShiftImm; + + // Depending on if we have a double loading operand version of + // the permute/shift, save that operand as well, and check if they + // are equal before proceeding with the transformation. + if (DefOpcode == PPC::XXPERMDI || DefOpcode == PPC::XXSLDWI) { + ShiftOp2 = DefMI->getOperand(2).getReg(); + ShiftImm = DefMI->getOperand(3).getImm(); + // The transformation is not performed if the operands differ. + if (ShiftOp1 != ShiftOp2) + break; + } else if (DefOpcode == PPC::XXPERMDIs || DefOpcode == PPC::XXSLDWIs) { + ShiftImm = DefMI->getOperand(2).getImm(); } + + // Find the new element index for the splat instruction to use. + unsigned NewElem = (SplatImm + ShiftImm) & 0x3; + if (MRI->hasOneNonDBGUse(ShiftRes)) { + LLVM_DEBUG(dbgs() << "Removing redundant shift/permute: "); + LLVM_DEBUG(DefMI->dump()); + ToErase = DefMI; + } + Simplified = true; + LLVM_DEBUG(dbgs() << "Changing splat immediate from " << SplatImm + << " to " << NewElem << " in instruction: "); + LLVM_DEBUG(MI.dump()); + + // Depending on the VSX register class of the shift/permute's input + // operand, ensure that it matches the class of the splat (VSRC). 
+ unsigned NewReg = + MF->getRegInfo().createVirtualRegister(&PPC::VSRCRegClass); + + if (MRI->getRegClass(ShiftOp1) == &PPC::VSSRCRegClass || + MRI->getRegClass(ShiftOp1) == &PPC::VSFRCRegClass) { + // Use SUBREG_TO_REG to insert the input value into the proper + // register subclass when the input is a VSSRC or VSFRC. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::SUBREG_TO_REG), + NewReg) + .addImm(1) + .add(DefMI->getOperand(1)) + .addImm(PPC::sub_64); + } else { + // Copy the input into a new VSX register of the correct subclass. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), NewReg) + .add(DefMI->getOperand(1)); + } + + MI.getOperand(1).setReg(NewReg); + MI.getOperand(2).setImm(NewElem); } break; } Index: llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll =================================================================== --- llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll +++ llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll @@ -13,8 +13,7 @@ ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8: lfiwzx f0, 0, r3 ; CHECK-P8: ld r4, .LC0@toc@l(r4) -; CHECK-P8: xxpermdi vs0, f0, f0, 2 -; CHECK-P8: xxspltw v2, vs0, 3 +; CHECK-P8: xxspltw v2, vs0, 1 ; CHECK-P8: stvx v2, 0, r4 ; CHECK-P8: lis r4, 1024 ; CHECK-P8: lfiwax f0, 0, r3 @@ -27,11 +26,9 @@ ; ; CHECK-P9-LABEL: testExpandPostRAPseudo: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9: lfiwzx f0, 0, r3 ; CHECK-P9: addis r4, r2, .LC0@toc@ha ; CHECK-P9: ld r4, .LC0@toc@l(r4) -; CHECK-P9: xxpermdi vs0, f0, f0, 2 -; CHECK-P9: xxspltw vs0, vs0, 3 +; CHECK-P9: lxvwsx vs0, 0, r3 ; CHECK-P9: stxvx vs0, 0, r4 ; CHECK-P9: lis r4, 1024 ; CHECK-P9: lfiwax f0, 0, r3 Index: llvm/test/CodeGen/PowerPC/build-vector-tests.ll =================================================================== --- llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -109,8 +109,8 @@ ;vector int spltRegVali(int val) { // ; return (vector int) val; // ;} // -;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): 
lfiwzx, xxsldwi, xxspltw // -;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // +;// P8: (LE) lfiwzx, xxspltw (BE): lfiwzx, xxspltw // +;// P9: (LE) lxvwsx (BE): lxvwsx // ;vector int spltMemVali(int *ptr) { // ; return (vector int)*ptr; // ;} // @@ -284,8 +284,8 @@ ;vector unsigned int spltRegValui(unsigned int val) { // ; return (vector unsigned int) val; // ;} // -;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // -;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // +;// P8: (LE) lfiwzx, xxspltw (BE): lfiwzx, xxspltw // +;// P9: (LE) lxvwsx (BE): lxvwsx // ;vector unsigned int spltMemValui(unsigned int *ptr) { // ; return (vector unsigned int)*ptr; // ;} // @@ -1202,21 +1202,15 @@ ; P9LE-LABEL: spltMemVali ; P8BE-LABEL: spltMemVali ; P8LE-LABEL: spltMemVali -; P9BE: lfiwzx f0, 0, r3 -; P9BE: xxsldwi vs0, f0, f0, 1 -; P9BE: xxspltw v2, vs0, 0 +; P9BE: lxvwsx v2, 0, r3 ; P9BE: blr -; P9LE: lfiwzx f0, 0, r3 -; P9LE: xxpermdi vs0, f0, f0, 2 -; P9LE: xxspltw v2, vs0, 3 +; P9LE: lxvwsx v2, 0, r3 ; P9LE: blr ; P8BE: lfiwzx f0, 0, r3 -; P8BE: xxsldwi vs0, f0, f0, 1 -; P8BE: xxspltw v2, vs0, 0 +; P8BE: xxspltw v2, vs0, 1 ; P8BE: blr ; P8LE: lfiwzx f0, 0, r3 -; P8LE: xxpermdi vs0, f0, f0, 2 -; P8LE: xxspltw v2, vs0, 3 +; P8LE: xxspltw v2, vs0, 1 ; P8LE: blr } @@ -2344,21 +2338,15 @@ ; P9LE-LABEL: spltMemValui ; P8BE-LABEL: spltMemValui ; P8LE-LABEL: spltMemValui -; P9BE: lfiwzx f0, 0, r3 -; P9BE: xxsldwi vs0, f0, f0, 1 -; P9BE: xxspltw v2, vs0, 0 +; P9BE: lxvwsx v2, 0, r3 ; P9BE: blr -; P9LE: lfiwzx f0, 0, r3 -; P9LE: xxpermdi vs0, f0, f0, 2 -; P9LE: xxspltw v2, vs0, 3 +; P9LE: lxvwsx v2, 0, r3 ; P9LE: blr ; P8BE: lfiwzx f0, 0, r3 -; P8BE: xxsldwi vs0, f0, f0, 1 -; P8BE: xxspltw v2, vs0, 0 +; P8BE: xxspltw v2, vs0, 1 ; P8BE: blr ; P8LE: lfiwzx f0, 0, r3 -; P8LE: xxpermdi vs0, f0, f0, 2 -; P8LE: xxspltw v2, vs0, 3 +; P8LE: xxspltw v2, vs0, 1 ; P8LE: blr } Index: llvm/test/CodeGen/PowerPC/load-v4i8-improved.ll 
=================================================================== --- llvm/test/CodeGen/PowerPC/load-v4i8-improved.ll +++ llvm/test/CodeGen/PowerPC/load-v4i8-improved.ll @@ -1,23 +1,30 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s \ -; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-LE \ -; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s \ -; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck \ -; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck -implicit-check-not vmrg \ +; RUN: -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck -implicit-check-not vmrg \ +; RUN: -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr9 \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-P9 \ +; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr9 \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-P9 \ +; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s define <16 x i8> @test(i32* %s, i32* %t) { -; CHECK-LE-LABEL: test: -; CHECK-LE: # %bb.0: # %entry -; CHECK-LE-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-LE-NEXT: xxspltw v2, vs0, 3 -; CHECK-LE-NEXT: blr +; CHECK-P9-LABEL: test: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr ; CHECK-LABEL: test: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-NEXT: xxspltw v2, vs0, 0 +; CHECK-NEXT: xxspltw v2, vs0, 1 ; CHECK-NEXT: blr entry: %0 = bitcast i32* %s to <4 x i8>* Index: llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll =================================================================== --- llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll +++ llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll @@ -57,16 +57,12 @@ define <4 x i32> @test4(i32* nocapture readonly %in) { ; CHECK-LABEL: test4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test4: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load i32, i32* %in, align 4 @@ -78,16 +74,12 @@ define <4 x float> @test5(float* nocapture readonly %in) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test5: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load float, float* %in, align 4 @@ -101,18 +93,14 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r3, r2, .LC0@toc@ha ; CHECK-NEXT: ld r3, .LC0@toc@l(r3) -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test6: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r3, r2, .LC0@toc@ha ; CHECK-BE-NEXT: ld r3, .LC0@toc@l(r3) -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; 
CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load i32, i32* @Globi, align 4 @@ -126,18 +114,14 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r3, r2, .LC1@toc@ha ; CHECK-NEXT: ld r3, .LC1@toc@l(r3) -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test7: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r3, r2, .LC1@toc@ha ; CHECK-BE-NEXT: ld r3, .LC1@toc@l(r3) -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load float, float* @Globf, align 4 Index: llvm/test/CodeGen/PowerPC/qpx-load-splat.ll =================================================================== --- llvm/test/CodeGen/PowerPC/qpx-load-splat.ll +++ llvm/test/CodeGen/PowerPC/qpx-load-splat.ll @@ -1,13 +1,25 @@ ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefix=CHECK-P9 %s +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefix=CHECK-P9 %s ; Function Attrs: norecurse nounwind readonly define <4 x double> @foo(double* nocapture readonly %a) #0 { ; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvdsx v2, 0, r3 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK: lxvdsx v2, 0, r3 +; CHECK: vmr v3, v2 +; CHECK: blr + +; CHECK-P9-LABEL: foo: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: lxvdsx v2, 0, r3 +; CHECK-P9: vmr v3, v2 
+; CHECK-P9: blr entry: %0 = load double, double* %a, align 8 %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 @@ -17,11 +29,18 @@ define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 { ; CHECK-LABEL: foox: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK: sldi r4, r4, 3 +; CHECK: lxvdsx v2, r3, r4 +; CHECK: vmr v3, v2 +; CHECK: blr + +; CHECK-P9-LABEL: foox: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: sldi r4, r4, 3 +; CHECK-P9: lxvdsx v2, r3, r4 +; CHECK-P9: vmr v3, v2 +; CHECK-P9: blr entry: %p = getelementptr double, double* %a, i64 %idx %0 = load double, double* %p, align 8 @@ -32,13 +51,22 @@ define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 { ; CHECK-LABEL: fooxu: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: lfdux f0, r3, r4 -; CHECK-NEXT: xxspltd v2, vs0, 0 -; CHECK-NEXT: std r3, 0(r5) -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK: sldi r4, r4, 3 +; CHECK: lfdux f0, r3, r4 +; CHECK: xxspltd v2, vs0, 0 +; CHECK: std r3, 0(r5) +; CHECK: vmr v3, v2 +; CHECK: blr + +; CHECK-P9-LABEL: fooxu: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: sldi r4, r4, 3 +; CHECK-P9: lfdux f0, r3, r4 +; CHECK-P9: std r3, 0(r5) +; CHECK-P9: xxspltd v2, vs0, 0 +; CHECK-P9: vmr v3, v2 +; CHECK-P9: blr entry: %p = getelementptr double, double* %a, i64 %idx %0 = load double, double* %p, align 8 @@ -51,10 +79,14 @@ define <4 x float> @foof(float* nocapture readonly %a) #0 { ; CHECK-LABEL: foof: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 -; CHECK-NEXT: blr +; CHECK: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxspltw v2, vs0, 1 +; CHECK: blr + +; CHECK-P9-LABEL: foof: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: lxvwsx v2, 0, r3 +; CHECK-P9: blr entry: %0 = load 
float, float* %a, align 4 %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 @@ -65,11 +97,16 @@ define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 { ; CHECK-LABEL: foofx: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 2 -; CHECK-NEXT: lfiwzx f0, r3, r4 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 -; CHECK-NEXT: blr +; CHECK: sldi r4, r4, 2 +; CHECK: lfiwzx f0, r3, r4 +; CHECK-NEXT: xxspltw v2, vs0, 1 +; CHECK: blr + +; CHECK-P9-LABEL: foofx: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: sldi r4, r4, 2 +; CHECK-P9: lxvwsx v2, r3, r4 +; CHECK-P9: blr entry: %p = getelementptr float, float* %a, i64 %idx %0 = load float, float* %p, align 4 Index: llvm/test/CodeGen/PowerPC/vsx.ll =================================================================== --- llvm/test/CodeGen/PowerPC/vsx.ll +++ llvm/test/CodeGen/PowerPC/vsx.ll @@ -1173,10 +1173,10 @@ ; CHECK-LE-LABEL: @test80 ; CHECK-LE-DAG: mtvsrd f0, r3 -; CHECK-LE-DAG: xxswapd vs0, vs0 +; CHECK-LE-DAG: addis r4, r2, .LCPI65_0@toc@ha ; CHECK-LE-DAG: addi r3, r4, .LCPI65_0@toc@l ; CHECK-LE-DAG: lvx v3, 0, r3 -; CHECK-LE-DAG: xxspltw v2, vs0, 3 +; CHECK-LE-DAG: xxspltw v2, vs0, 1 ; CHECK-LE-NOT: xxswapd v3, ; CHECK-LE: vadduwm v2, v2, v3 ; CHECK-LE: blr