Index: lib/Target/PowerPC/PPCMIPeephole.cpp =================================================================== --- lib/Target/PowerPC/PPCMIPeephole.cpp +++ lib/Target/PowerPC/PPCMIPeephole.cpp @@ -400,9 +400,9 @@ case PPC::VSPLTH: case PPC::XXSPLTW: { unsigned MyOpcode = MI.getOpcode(); - unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2; + unsigned RegOpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2; unsigned TrueReg = - TII->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI); + TII->lookThruCopyLike(MI.getOperand(RegOpNo).getReg(), MRI); if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); @@ -433,32 +433,90 @@ DEBUG(MI.dump()); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), MI.getOperand(0).getReg()) - .add(MI.getOperand(OpNo)); + .add(MI.getOperand(RegOpNo)); ToErase = &MI; Simplified = true; } - // Splat fed by a shift. Usually when we align value to splat into - // vector element zero. - if (DefOpcode == PPC::XXSLDWI) { - unsigned ShiftRes = DefMI->getOperand(0).getReg(); - unsigned ShiftOp1 = DefMI->getOperand(1).getReg(); - unsigned ShiftOp2 = DefMI->getOperand(2).getReg(); - unsigned ShiftImm = DefMI->getOperand(3).getImm(); - unsigned SplatImm = MI.getOperand(2).getImm(); - if (ShiftOp1 == ShiftOp2) { - unsigned NewElem = (SplatImm + ShiftImm) & 0x3; - if (MRI->hasOneNonDBGUse(ShiftRes)) { - DEBUG(dbgs() << "Removing redundant shift: "); - DEBUG(DefMI->dump()); - ToErase = DefMI; - } - Simplified = true; - DEBUG(dbgs() << "Changing splat immediate from " << SplatImm << - " to " << NewElem << " in instruction: "); + + // Get the op number for the immediate and then check to make sure + // that it is an immediate. + unsigned ImmOpNo = MyOpcode == PPC::XXSPLTW ? 2 : 1; + assert(MI.getOperand(ImmOpNo).isImm() && + "Operand should be an immediate."); + + // Deal with the situation where a splat is fed by either a shift + // or a swap. 
+ // Splat is fed by a SHIFT of the form + // XXSLDWI %VA, %VA, imm + // Splat is fed by a SWAP which is a permute of this form + // XXPERMDI %VA, %VA, 2 + // Since the splat instruction can use any of the vector elements to do + // the splat we do not have to rearrange the elements in the vector + // with a swap or shift before we do the splat. We can simply do the + // splat from a different index. + // If the swap or shift has only one use (the splat) then we can + // completely remove it. + if (DefOpcode == PPC::XXSLDWI || DefOpcode == PPC::XXPERMDI) { + unsigned DefRes = DefMI->getOperand(0).getReg(); + unsigned DefOp1 = DefMI->getOperand(1).getReg(); + unsigned DefOp2 = DefMI->getOperand(2).getReg(); + + // Note that in both cases, the immediate represents the + // number of words the input vector is rotated left. + // For example: xxsldwi x, y, y, 2 == xxpermdi x, y, y, 2. + unsigned DefImm = DefMI->getOperand(3).getImm(); + + unsigned SplatImm = MI.getOperand(ImmOpNo).getImm(); + + // If the two register operands differ, this isn't a + // shift/swap but a permutation of a pair of concatenated + // registers. + if (DefOp1 != DefOp2) + break; + + // The permute is not a swap so there is nothing we can do. + if (DefOpcode == PPC::XXPERMDI && DefImm != 2) + break; + + unsigned NewElem = 0; + // Compute the new index to use for the splat. 
+ if (MI.getOpcode() == PPC::VSPLTB) + NewElem = (SplatImm + DefImm*4) & 0xF; + else if (MI.getOpcode() == PPC::VSPLTH) + NewElem = (SplatImm + DefImm*2) & 0x7; + else if (MI.getOpcode() == PPC::XXSPLTW) + NewElem = (SplatImm + DefImm) & 0x3; + else { + DEBUG(dbgs() << "Unknown splat opcode."); DEBUG(MI.dump()); - MI.getOperand(1).setReg(ShiftOp1); - MI.getOperand(2).setImm(NewElem); + break; + } + + if (MRI->hasOneNonDBGUse(DefRes)) { + DEBUG(dbgs() << "Removing redundant instruction: "); + DEBUG(DefMI->dump()); + ToErase = DefMI; + } + Simplified = true; + DEBUG(dbgs() << "Changing splat immediate from " << SplatImm << + " to " << NewElem << " in instruction: "); + DEBUG(MI.dump()); + + // Depending on the type of splat we may need a VSR or a VR register. + unsigned TmpReg = 0; + if (MI.getOpcode() == PPC::VSPLTB || MI.getOpcode() == PPC::VSPLTH) { + // Need a VR register so copy into one. + TmpReg = MF->getRegInfo().createVirtualRegister(&PPC::VRRCRegClass); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), TmpReg) + .add(DefMI->getOperand(1)); + } else { + // MI.getOpcode() == PPC::XXSPLTW + // Need a VSR register so just use the existing one. 
+ TmpReg = DefOp1; } + + MI.getOperand(RegOpNo).setReg(TmpReg); + MI.getOperand(ImmOpNo).setImm(NewElem); } break; } Index: test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll =================================================================== --- test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -16,7 +16,7 @@ ; CHECK: sldi [[REG1:[0-9]+]], 3, 56 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]] ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3 -; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]] +; CHECK-LE: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 } ; Function Attrs: norecurse nounwind readnone @@ -28,7 +28,7 @@ ; CHECK: sldi [[REG1:[0-9]+]], 3, 48 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]] ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3 -; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]] +; CHECK-LE: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 } ; Function Attrs: norecurse nounwind readnone Index: test/CodeGen/PowerPC/ppc-xxsldwi-peephole.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/ppc-xxsldwi-peephole.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s + +; Function Attrs: norecurse nounwind readnone +define <16 x i8> @test(<16 x i8> %a, i8 zeroext %b) local_unnamed_addr { +entry: + %0 = bitcast <16 x i8> %a to <4 x i32> + %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> + %2 = bitcast <4 x i32> %1 to <16 x i8> + %add = add <16 x i8> %2, %a + %splat.splat = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> + %add1 = add <16 x i8> %add, %splat.splat + ret <16 x i8> %add1 +; CHECK-LABEL: test +; CHECK: vspltb +; CHECK: blr +} + Index: test/CodeGen/PowerPC/ppc64-peephole-swap.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/ppc64-peephole-swap.ll @@ -0,0 +1,122 @@ 
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s --implicit-check-not xxswapd +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 --implicit-check-not xxswapd + +; The straightforward expansion of this code will result in a swap followed by a +; splat. However, the swap is not needed since in this case the splat is the +; only use. +; We want to check that we are not using the swap and that we have indexed the +; splat to the correct location. +; 8 Bit Signed Version of the test. +; Function Attrs: norecurse nounwind readnone +define <16 x i8> @splat_8_plus(<16 x i8> %v, i8 signext %c) local_unnamed_addr { +entry: + %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0 + %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer + %add = add <16 x i8> %splat.splat.i, %v + ret <16 x i8> %add +; CHECK-LABEL: splat_8_plus +; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_8_plus +; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK-PWR8: blr +} + +; 8 Bit Unsigned Version of the test. +; Function Attrs: norecurse nounwind readnone +define <16 x i8> @splat_u8_plus(<16 x i8> %v, i8 zeroext %c) local_unnamed_addr { +entry: + %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0 + %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer + %add = add <16 x i8> %splat.splat.i, %v + ret <16 x i8> %add +; CHECK-LABEL: splat_u8_plus +; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_u8_plus +; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK-PWR8: blr +} + +; 16 Bit Signed Version of the test. 
+; Function Attrs: norecurse nounwind readnone +define <8 x i16> @splat_16_plus(<8 x i16> %v, i16 signext %c) local_unnamed_addr { +entry: + %0 = shl i16 %c, 8 + %conv.i = ashr exact i16 %0, 8 + %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0 + %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer + %add = add <8 x i16> %splat.splat.i, %v + ret <8 x i16> %add +; CHECK-LABEL: splat_16_plus +; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_16_plus +; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK-PWR8: blr +} + +; 16 Bit Unsigned Version of the test. +; Function Attrs: norecurse nounwind readnone +define <8 x i16> @splat_u16_plus(<8 x i16> %v, i16 zeroext %c) local_unnamed_addr { +entry: + %0 = shl i16 %c, 8 + %conv.i = ashr exact i16 %0, 8 + %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0 + %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer + %add = add <8 x i16> %splat.splat.i, %v + ret <8 x i16> %add +; CHECK-LABEL: splat_u16_plus +; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_u16_plus +; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK-PWR8: blr +} + +; 32 Bit Signed Version of the test. +; The 32 bit examples work differently than the 8 and 16 bit versions of the +; test. On Power 9 we have the mtvsrws instruction that does both the move to +; register and the splat so it does not really test the newly implemented code. +; On Power 9 for the 32 bit case we don't need the new simplification. It is +; just here for completeness. 
+; Function Attrs: norecurse nounwind readnone +define <4 x i32> @splat_32_plus(<4 x i32> %v, i32 signext %c) local_unnamed_addr { +entry: + %sext = shl i32 %c, 24 + %conv.i = ashr exact i32 %sext, 24 + %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0 + %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %add = add <4 x i32> %splat.splat.i, %v + ret <4 x i32> %add +; CHECK-LABEL: splat_32_plus +; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}} +; CHECK: blr +; CHECK-PWR8-LABEL: splat_32_plus +; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1 +; CHECK-PWR8: blr +} + +; 32 Bit Unsigned Version of the test. +; The 32 bit examples work differently than the 8 and 16 bit versions of the +; test. On Power 9 we have the mtvsrws instruction that does both the move to +; register and the splat so it does not really test the newly implemented code. +; On Power 9 for the 32 bit case we don't need the new simplification. It is +; just here for completeness. 
+; Function Attrs: norecurse nounwind readnone +define <4 x i32> @splat_u32_plus(<4 x i32> %v, i32 zeroext %c) local_unnamed_addr { +entry: + %sext = shl i32 %c, 24 + %conv.i = ashr exact i32 %sext, 24 + %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0 + %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %add = add <4 x i32> %splat.splat.i, %v + ret <4 x i32> %add +; CHECK-LABEL: splat_u32_plus +; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}} +; CHECK: blr +; CHECK-PWR8-LABEL: splat_u32_plus +; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1 +; CHECK-PWR8: blr +} + Index: test/CodeGen/PowerPC/vsx.ll =================================================================== --- test/CodeGen/PowerPC/vsx.ll +++ test/CodeGen/PowerPC/vsx.ll @@ -1163,10 +1163,9 @@ ; CHECK-LE-LABEL: @test80 ; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3 -; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]] ; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI ; CHECK-LE-DAG: lvx 3, 0, [[R2]] -; CHECK-LE-DAG: xxspltw 34, [[V1]] +; CHECK-LE-DAG: xxspltw 34, {{[0-9]+}} ; CHECK-LE-NOT: xxswapd 35, [[V2]] ; CHECK-LE: vadduwm 2, 2, 3 ; CHECK-LE: blr