Index: lib/Target/PowerPC/PPCMIPeephole.cpp =================================================================== --- lib/Target/PowerPC/PPCMIPeephole.cpp +++ lib/Target/PowerPC/PPCMIPeephole.cpp @@ -352,28 +352,69 @@ ToErase = &MI; Simplified = true; } - // Splat fed by a shift. Usually when we align value to splat into - // vector element zero. - if (DefOpcode == PPC::XXSLDWI) { - unsigned ShiftRes = DefMI->getOperand(0).getReg(); - unsigned ShiftOp1 = DefMI->getOperand(1).getReg(); - unsigned ShiftOp2 = DefMI->getOperand(2).getReg(); - unsigned ShiftImm = DefMI->getOperand(3).getImm(); - unsigned SplatImm = MI.getOperand(2).getImm(); - if (ShiftOp1 == ShiftOp2) { - unsigned NewElem = (SplatImm + ShiftImm) & 0x3; - if (MRI->hasOneNonDBGUse(ShiftRes)) { - DEBUG(dbgs() << "Removing redundant shift: "); - DEBUG(DefMI->dump()); - ToErase = DefMI; - } - Simplified = true; - DEBUG(dbgs() << "Changing splat immediate from " << SplatImm << - " to " << NewElem << " in instruction: "); + + // Get the op number for the immediate and then check to make sure + // that it is an immediate. + unsigned ImmOpNo = MyOpcode == PPC::XXSPLTW ? 2 : 1; + if (!MI.getOperand(ImmOpNo).isImm()) { + DEBUG(dbgs() << "Expected op number: " << ImmOpNo << + " to be an immediate.\n"); + DEBUG(MI.dump()); + break; + } + + // Deal with the situation where a splat is fed by either a shift + // or a swap. + // Splat is fed by a SHIFT of the form + // XXSLDWI %VA, %VA, imm + // Splat is fed by a SWAP which is a permute of this form + // XXPERMDI %VA, %VA, 2 + // Since the splat instruction can use any of the vector elements to do + // the splat we do not have to rearrange the elements in the vector + // with a swap or shift before we do the splat. We can simply do the + // splat from a different index. + // If the swap or shift has only one use (the splat) then we can + // completely remove it. 
+ if (DefOpcode == PPC::XXSLDWI || DefOpcode == PPC::XXPERMDI) { + unsigned DefRes = DefMI->getOperand(0).getReg(); + unsigned DefOp1 = DefMI->getOperand(1).getReg(); + unsigned DefOp2 = DefMI->getOperand(2).getReg(); + unsigned DefImm = DefMI->getOperand(3).getImm(); + unsigned SplatImm = MI.getOperand(ImmOpNo).getImm(); + + // Check that the two ops are equal. + if (DefOp1 != DefOp2) + break; + + // The permute is not a swap so there is nothing we can do. + if (DefOpcode == PPC::XXPERMDI && DefImm != 2) + break; + + unsigned NewElem = 0; + // Compute the new index to use for the splat. + if (MI.getOpcode() == PPC::VSPLTB) + NewElem = (SplatImm + DefImm*4) & 0xF; + else if (MI.getOpcode() == PPC::VSPLTH) + NewElem = (SplatImm + DefImm*2) & 0x7; + else if (MI.getOpcode() == PPC::XXSPLTW) + NewElem = (SplatImm + DefImm) & 0x3; + else { + DEBUG(dbgs() << "Unknown splat opcode."); DEBUG(MI.dump()); - MI.getOperand(1).setReg(ShiftOp1); - MI.getOperand(2).setImm(NewElem); + break; } + + if (MRI->hasOneNonDBGUse(DefRes)) { + DEBUG(dbgs() << "Removing redundant instruction: "); + DEBUG(DefMI->dump()); + ToErase = DefMI; + } + Simplified = true; + DEBUG(dbgs() << "Changing splat immediate from " << SplatImm << + " to " << NewElem << " in instruction: "); + DEBUG(MI.dump()); + MI.getOperand(OpNo).setReg(DefOp1); + MI.getOperand(ImmOpNo).setImm(NewElem); } break; } Index: test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll =================================================================== --- test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -16,7 +16,7 @@ ; CHECK: sldi [[REG1:[0-9]+]], 3, 56 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]] ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3 -; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]] +; CHECK-LE: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 } ; Function Attrs: norecurse nounwind readnone @@ -28,7 +28,7 @@ ; CHECK: sldi [[REG1:[0-9]+]], 3, 48 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]] ; CHECK-LE: mtvsrd 
[[REG1:[0-9]+]], 3 -; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]] +; CHECK-LE: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 } ; Function Attrs: norecurse nounwind readnone Index: test/CodeGen/PowerPC/ppc-xxsldwi-peephole.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/ppc-xxsldwi-peephole.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s + +; Function Attrs: norecurse nounwind readnone +define <16 x i8> @test(<16 x i8> %a, i8 zeroext %b) local_unnamed_addr { +entry: + %0 = bitcast <16 x i8> %a to <4 x i32> + %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> + %2 = bitcast <4 x i32> %1 to <16 x i8> + %add = add <16 x i8> %2, %a + %splat.splat = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> + %add1 = add <16 x i8> %add, %splat.splat + ret <16 x i8> %add1 +; CHECK-LABEL: test +; CHECK: vspltb +; CHECK: blr +} + Index: test/CodeGen/PowerPC/ppc64-peephole-swap.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/ppc64-peephole-swap.ll @@ -0,0 +1,134 @@ +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 + +; The straightforward expansion of this code will result in a swap followed by a +; splat. However, the swap is not needed since in this case the splat is the +; only use. +; We want to check that we are not using the swap and that we have indexed the +; splat to the correct location. +; 8 Bit Signed Version of the test. 
+; Function Attrs: norecurse nounwind readnone +define <16 x i8> @splat_8_plus(<16 x i8> %v, i8 signext %c) local_unnamed_addr { +entry: + %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0 + %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer + %add = add <16 x i8> %splat.splat.i, %v + ret <16 x i8> %add +; CHECK-LABEL: splat_8_plus +; CHECK-NOT: xxswapd +; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_8_plus +; CHECK-PWR8-NOT: xxswapd +; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK-PWR8: blr +} + +; 8 Bit Unsigned Version of the test. +; Function Attrs: norecurse nounwind readnone +define <16 x i8> @splat_u8_plus(<16 x i8> %v, i8 zeroext %c) local_unnamed_addr { +entry: + %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0 + %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer + %add = add <16 x i8> %splat.splat.i, %v + ret <16 x i8> %add +; CHECK-LABEL: splat_u8_plus +; CHECK-NOT: xxswapd +; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_u8_plus +; CHECK-PWR8-NOT: xxswapd +; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7 +; CHECK-PWR8: blr +} + +; 16 Bit Signed Version of the test. 
+; Function Attrs: norecurse nounwind readnone +define <8 x i16> @splat_16_plus(<8 x i16> %v, i16 signext %c) local_unnamed_addr { +entry: + %0 = shl i16 %c, 8 + %conv.i = ashr exact i16 %0, 8 + %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0 + %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer + %add = add <8 x i16> %splat.splat.i, %v + ret <8 x i16> %add +; CHECK-LABEL: splat_16_plus +; CHECK-NOT: xxswapd +; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_16_plus +; CHECK-PWR8-NOT: xxswapd +; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK-PWR8: blr +} + +; 16 Bit Unsigned Version of the test. +; Function Attrs: norecurse nounwind readnone +define <8 x i16> @splat_u16_plus(<8 x i16> %v, i16 zeroext %c) local_unnamed_addr { +entry: + %0 = shl i16 %c, 8 + %conv.i = ashr exact i16 %0, 8 + %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0 + %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer + %add = add <8 x i16> %splat.splat.i, %v + ret <8 x i16> %add +; CHECK-LABEL: splat_u16_plus +; CHECK-NOT: xxswapd +; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK: blr +; CHECK-PWR8-LABEL: splat_u16_plus +; CHECK-PWR8-NOT: xxswapd +; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3 +; CHECK-PWR8: blr +} + +; 32 Bit Signed Version of the test. +; The 32 bit examples work differently than the 8 and 16 bit versions of the +; test. On Power 9 we have the mtvsrws instruction that does both the move to +; register and the splat so it does not really test the newly implemented code. +; On Power 9 for the 32 bit case we don't need the new simplification. It is +; just here for completeness. 
+; Function Attrs: norecurse nounwind readnone +define <4 x i32> @splat_32_plus(<4 x i32> %v, i32 signext %c) local_unnamed_addr { +entry: + %sext = shl i32 %c, 24 + %conv.i = ashr exact i32 %sext, 24 + %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0 + %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %add = add <4 x i32> %splat.splat.i, %v + ret <4 x i32> %add +; CHECK-LABEL: splat_32_plus +; CHECK-NOT: xxswapd +; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}} +; CHECK: blr +; CHECK-PWR8-LABEL: splat_32_plus +; CHECK-PWR8-NOT: xxswapd +; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1 +; CHECK-PWR8: blr +} + +; 32 Bit Unsigned Version of the test. +; The 32 bit examples work differently than the 8 and 16 bit versions of the +; test. On Power 9 we have the mtvsrws instruction that does both the move to +; register and the splat so it does not really test the newly implemented code. +; On Power 9 for the 32 bit case we don't need the new simplification. It is +; just here for completeness. 
+; Function Attrs: norecurse nounwind readnone +define <4 x i32> @splat_u32_plus(<4 x i32> %v, i32 zeroext %c) local_unnamed_addr { +entry: + %sext = shl i32 %c, 24 + %conv.i = ashr exact i32 %sext, 24 + %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0 + %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %add = add <4 x i32> %splat.splat.i, %v + ret <4 x i32> %add +; CHECK-LABEL: splat_u32_plus +; CHECK-NOT: xxswapd +; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}} +; CHECK: blr +; CHECK-PWR8-LABEL: splat_u32_plus +; CHECK-PWR8-NOT: xxswapd +; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1 +; CHECK-PWR8: blr +} + Index: test/CodeGen/PowerPC/vsx.ll =================================================================== --- test/CodeGen/PowerPC/vsx.ll +++ test/CodeGen/PowerPC/vsx.ll @@ -1163,10 +1163,9 @@ ; CHECK-LE-LABEL: @test80 ; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3 -; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]] ; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI ; CHECK-LE-DAG: lvx 3, 0, [[R2]] -; CHECK-LE-DAG: xxspltw 34, [[V1]] +; CHECK-LE-DAG: xxspltw 34, {{[0-9]+}} ; CHECK-LE-NOT: xxswapd 35, [[V2]] ; CHECK-LE: vadduwm 2, 2, 3 ; CHECK-LE: blr