diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -790,6 +790,9 @@
       return true;
     }
 
+    bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                   unsigned &Cost) const override;
+
     bool isCtlzFast() const override {
       return true;
     }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1621,6 +1621,35 @@
   return VT.isScalarInteger();
 }
 
+bool PPCTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
+    return false;
+  // Expect to extract one constant-index element from an integer vector.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+  auto *VTy = dyn_cast<FixedVectorType>(VectorTy);
+  if (!VTy || !VTy->getScalarType()->isIntegerTy())
+    return false;
+  // The element index defines the region that can be stored without
+  // conversion; it depends on the store width and the endianness.
+  unsigned BitWidth = VTy->getScalarSizeInBits();
+  unsigned ElemIdx;
+  if (BitWidth == 32 && Subtarget.hasP8Vector()) {
+    ElemIdx = Subtarget.isLittleEndian() ? 2 : 1;
+  } else if (BitWidth == 64) {
+    ElemIdx = Subtarget.isLittleEndian() ? 1 : 0;
+  } else
+    return false;
+  // Accept the combine only if the element index matches the index from the
+  // extract operation.
+  if (cast<ConstantInt>(Idx)->getZExtValue() == ElemIdx) {
+    Cost = 1;
+    return true;
+  }
+  return false;
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER: break;
diff --git a/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll b/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll
--- a/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll
@@ -27,38 +27,36 @@
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    lxv v2, 0(r3)
+; CHECK-NEXT:    lbz r7, 2(r7)
 ; CHECK-NEXT:    mr r9, r6
 ; CHECK-NEXT:    mr r6, r5
-; CHECK-NEXT:    li r12, -127
-; CHECK-NEXT:    li r0, 4
-; CHECK-NEXT:    stb r12, 0(r3)
-; CHECK-NEXT:    li r2, 1
-; CHECK-NEXT:    std r0, 0(r3)
-; CHECK-NEXT:    stw r2, 0(r3)
-; CHECK-NEXT:    li r11, 3
-; CHECK-NEXT:    stb r11, 0(0)
-; CHECK-NEXT:    mfvsrd r5, v2
-; CHECK-NEXT:    vaddudm v3, v2, v2
+; CHECK-NEXT:    li r5, 3
+; CHECK-NEXT:    li r10, -127
+; CHECK-NEXT:    li r11, 4
+; CHECK-NEXT:    stb r5, 0(0)
+; CHECK-NEXT:    li r12, 1
+; CHECK-NEXT:    stb r10, 0(r3)
+; CHECK-NEXT:    std r11, 0(r3)
+; CHECK-NEXT:    stw r12, 0(r3)
+; CHECK-NEXT:    vnegd v3, v2
+; CHECK-NEXT:    vaddudm v4, v2, v2
 ; CHECK-NEXT:    pstxv v2, 64(r1), 0
-; CHECK-NEXT:    neg r5, r5
-; CHECK-NEXT:    mfvsrd r10, v3
-; CHECK-NEXT:    std r5, 0(r3)
-; CHECK-NEXT:    lbz r5, 2(r7)
+; CHECK-NEXT:    stxsd v3, 0(r3)
+; CHECK-NEXT:    stb r5, 0(r3)
+; CHECK-NEXT:    rlwinm r5, r7, 0, 27, 27
 ; CHECK-NEXT:    mr r7, r9
-; CHECK-NEXT:    stb r11, 0(r3)
-; CHECK-NEXT:    stb r12, 0(r3)
-; CHECK-NEXT:    std r2, 0(r3)
-; CHECK-NEXT:    neg r10, r10
-; CHECK-NEXT:    rlwinm r5, r5, 0, 27, 27
+; CHECK-NEXT:    stb r10, 0(r3)
+; CHECK-NEXT:    std r12, 0(r3)
+; CHECK-NEXT:    vnegd v2, v4
 ; CHECK-NEXT:    stb r5, 0(0)
 ; CHECK-NEXT:    lbz r5, 2(r8)
 ; CHECK-NEXT:    rlwinm r5, r5, 0, 27, 27
 ; CHECK-NEXT:    stb r5, 0(r3)
 ; CHECK-NEXT:    li r5, 2
-; CHECK-NEXT:    std r0, 0(r3)
+; CHECK-NEXT:    std r11, 0(r3)
 ; CHECK-NEXT:    stw r5, 0(r3)
 ; CHECK-NEXT:    mr r5, r4
-; CHECK-NEXT:    std r10, 0(r3)
+; CHECK-NEXT:    stxsd v2, 0(r3)
 ; CHECK-NEXT:    bl foo@notoc
 ; CHECK-NEXT:    extsw r3, r3
 ; CHECK-NEXT:    addi r1, r1, 80
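
Note: the following is a minimal, hypothetical IR sketch (the function name and values are illustrative, not taken from the patch) of the kind of pattern this hook is meant to unlock. With canCombineStoreAndExtract returning true for element 1 of <2 x i64> on little-endian PPC64, CodeGenPrepare can keep the scalar negate in a vector register and sink the extract next to the store, so the backend can emit vnegd + stxsd instead of mfvsrd + neg + std, which matches the changes visible in the test above.

define void @store_neg_elt1(<2 x i64> %v, ptr %p) {
  %e = extractelement <2 x i64> %v, i64 1   ; constant index matching ElemIdx for LE 64-bit
  %n = sub i64 0, %e                        ; scalar negate that can be promoted to a vector op
  store i64 %n, ptr %p                      ; the extract can feed the store directly
  ret void
}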