Skip to content

Commit

Permalink
[PowerPC] Try harder to avoid load/move-to VSR for partial vector loads
Browse files Browse the repository at this point in the history
Update the function in PPCISelLowering.cpp that decides whether to avoid the
update form in favor of partial vector loads, so that it recognizes the newer
load types and is no longer confused by the chain operand.

Differential Revision: https://reviews.llvm.org/D60102

llvm-svn: 359504
  • Loading branch information
RolandF77 committed Apr 29, 2019
1 parent 58b1663 commit 728e139
Show file tree
Hide file tree
Showing 2 changed files with 172 additions and 37 deletions.
51 changes: 36 additions & 15 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2428,24 +2428,45 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,

/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
static bool usePartialVectorLoads(SDNode *N) {
if (!N->hasOneUse())
return false;

static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {

// If there are any other uses other than scalar to vector, then we should
// keep it as a scalar load -> direct move pattern to prevent multiple
// loads. Currently, only check for i64 since we have lxsd/lfd to do this
// efficiently, but no update equivalent.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
EVT MemVT = LD->getMemoryVT();
if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) {
SDNode *User = *(LD->use_begin());
if (User->getOpcode() == ISD::SCALAR_TO_VECTOR)
return true;
}
// loads.
LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
if (!LD)
return false;

EVT MemVT = LD->getMemoryVT();
if (!MemVT.isSimple())
return false;
switch(MemVT.getSimpleVT().SimpleTy) {
case MVT::i64:
break;
case MVT::i32:
if (!ST.hasP8Vector())
return false;
break;
case MVT::i16:
case MVT::i8:
if (!ST.hasP9Vector())
return false;
break;
default:
return false;
}

return false;
SDValue LoadedVal(N, 0);
if (!LoadedVal.hasOneUse())
return false;

for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
UI != UE; ++UI)
if (UI.getUse().get().getResNo() == 0 &&
UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
return false;

return true;
}

/// getPreIndexedAddressParts - returns true by value, base pointer and
Expand Down Expand Up @@ -2476,7 +2497,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
// Do not generate pre-inc forms for specific loads that feed scalar_to_vector
// instructions because we can fold these into a more efficient instruction
// instead, (such as LXSD).
if (isLoad && usePartialVectorLoads(N)) {
if (isLoad && usePartialVectorLoads(N, Subtarget)) {
return false;
}

Expand Down
158 changes: 136 additions & 22 deletions llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mcpu=pwr9 -O3 -verify-machineinstrs -ppc-vsr-nums-as-vr \
; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: < %s | FileCheck %s
Expand Down Expand Up @@ -30,26 +29,25 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; CHECK-NEXT: xvnegsp v0, v1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lfd f0, 0(r3)
; CHECK: lfd f0, 0(r3)
; CHECK-NEXT: xxpermdi v1, f0, f0, 2
; CHECK-NEXT: vperm v6, v3, v1, v2
; CHECK-NEXT: vperm v1, v1, v3, v4
; CHECK-NEXT: xvnegsp v6, v6
; CHECK-NEXT: vperm v6, v1, v3, v4
; CHECK-NEXT: vperm v1, v3, v1, v2
; CHECK-NEXT: xvnegsp v1, v1
; CHECK-NEXT: vabsduw v6, v6, v5
; CHECK-NEXT: vabsduw v1, v1, v0
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: xvnegsp v6, v6
; CHECK-NEXT: vabsduw v1, v1, v5
; CHECK-NEXT: vabsduw v6, v6, v0
; CHECK-NEXT: vadduwm v1, v6, v1
; CHECK-NEXT: xxswapd v6, v1
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: xxspltw v6, v1, 2
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: vextuwrx r7, r5, v1
; CHECK-NEXT: ldux r8, r3, r4
; CHECK-NEXT: add r3, r3, r4
; CHECK-NEXT: lfdx f0, r3, r4
; CHECK-NEXT: add r6, r7, r6
; CHECK-NEXT: mtvsrd f0, r8
; CHECK-NEXT: xxswapd v1, vs0
; CHECK-NEXT: add r7, r3, r4
; CHECK-NEXT: xxpermdi v1, f0, f0, 2
; CHECK-NEXT: add r3, r7, r4
; CHECK-NEXT: vperm v6, v3, v1, v2
; CHECK-NEXT: vperm v1, v1, v3, v4
; CHECK-NEXT: xvnegsp v6, v6
Expand All @@ -61,8 +59,8 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: xxspltw v6, v1, 2
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: vextuwrx r7, r5, v1
; CHECK-NEXT: add r6, r7, r6
; CHECK-NEXT: vextuwrx r8, r5, v1
; CHECK-NEXT: add r6, r8, r6
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: extsw r3, r6
Expand All @@ -89,8 +87,7 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; P9BE-NEXT: xvnegsp v0, v1
; P9BE-NEXT: .p2align 4
; P9BE-NEXT: .LBB0_1: # %for.cond1.preheader
; P9BE-NEXT: # =>This Inner Loop Header: Depth=1
; P9BE-NEXT: lfd f0, 0(r3)
; P9BE: lfd f0, 0(r3)
; P9BE-NEXT: xxlor v1, vs0, vs0
; P9BE-NEXT: vperm v6, v3, v1, v4
; P9BE-NEXT: vperm v1, v3, v1, v2
Expand All @@ -104,10 +101,11 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; P9BE-NEXT: xxspltw v6, v1, 1
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: vextuwlx r7, r5, v1
; P9BE-NEXT: lfdx f0, r3, r4
; P9BE-NEXT: add r6, r7, r6
; P9BE-NEXT: ldux r7, r3, r4
; P9BE-NEXT: add r3, r3, r4
; P9BE-NEXT: mtvsrd v1, r7
; P9BE-NEXT: add r7, r3, r4
; P9BE-NEXT: xxlor v1, vs0, vs0
; P9BE-NEXT: add r3, r7, r4
; P9BE-NEXT: vperm v6, v3, v1, v2
; P9BE-NEXT: vperm v1, v3, v1, v4
; P9BE-NEXT: xvnegsp v6, v6
Expand All @@ -119,8 +117,8 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: xxspltw v6, v1, 1
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: vextuwlx r7, r5, v1
; P9BE-NEXT: add r6, r7, r6
; P9BE-NEXT: vextuwlx r8, r5, v1
; P9BE-NEXT: add r6, r8, r6
; P9BE-NEXT: bdnz .LBB0_1
; P9BE-NEXT: # %bb.2: # %for.cond.cleanup
; P9BE-NEXT: extsw r3, r6
Expand Down Expand Up @@ -281,3 +279,119 @@ entry:
; return i_sum;
;}

; test32: two <4 x i8> (32-bit wide) loads, taken at %pix2 + %i_pix2 and four
; bytes past that address, are lane-reversed, zero-extended to <4 x i32> and
; combined before being stored. The checks at the end of the function expect
; two lfiwzx loads and reject the update-form lwzux as well as mtvsrws.
define void @test32(i8* nocapture readonly %pix2, i32 signext %i_pix2) {
entry:
%idx.ext63 = sext i32 %i_pix2 to i64
%add.ptr64 = getelementptr inbounds i8, i8* %pix2, i64 %idx.ext63
%arrayidx5.1 = getelementptr inbounds i8, i8* %add.ptr64, i64 4
%0 = bitcast i8* %add.ptr64 to <4 x i8>*
%1 = load <4 x i8>, <4 x i8>* %0, align 1
%reorder_shuffle117 = shufflevector <4 x i8> %1, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%2 = zext <4 x i8> %reorder_shuffle117 to <4 x i32>
%3 = sub nsw <4 x i32> zeroinitializer, %2
%4 = bitcast i8* %arrayidx5.1 to <4 x i8>*
%5 = load <4 x i8>, <4 x i8>* %4, align 1
%reorder_shuffle115 = shufflevector <4 x i8> %5, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%6 = zext <4 x i8> %reorder_shuffle115 to <4 x i32>
%7 = sub nsw <4 x i32> zeroinitializer, %6
%8 = shl nsw <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
%9 = add nsw <4 x i32> %8, %3
%10 = sub nsw <4 x i32> %9, zeroinitializer
%11 = shufflevector <4 x i32> undef, <4 x i32> %10, <4 x i32> <i32 2, i32 7, i32 0, i32 5>
%12 = add nsw <4 x i32> zeroinitializer, %11
%13 = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
store <4 x i32> %13, <4 x i32>* undef, align 16
ret void
; CHECK-LABEL: test32:
; CHECK-NOT: lwzux
; CHECK-NOT: mtvsrws
; CHECK: lfiwzx
; CHECK: lfiwzx
; NOTE(review): the lines below are spelled with a "P9BE-CHECK-" stem, while
; the big-endian checks earlier in this file attach the suffix directly to the
; P9BE prefix. They may therefore never be matched by FileCheck; confirm
; against the RUN lines before relying on big-endian coverage here.
; P9BE-CHECK-LABEL: test32:
; P9BE-CHECK-NOT: lwzux
; P9BE-CHECK-NOT: mtvsrws
; P9BE-CHECK: lfiwzx
; P9BE-CHECK: lfiwzx
}

; test16: two scalar i16 loads, from %sums[%delta] and %sums[%delta + 8], are
; inserted into lanes 2 and 3 of a <4 x i16>, zero-extended to <4 x i32> and
; reduced to a scalar that is compared against %thresh. The checks at the end
; of the function expect two lxsihzx loads and reject the update-form lhzux.
define void @test16(i16* nocapture readonly %sums, i32 signext %delta, i32 signext %thresh) {
entry:
%idxprom = sext i32 %delta to i64
%add14 = add nsw i32 %delta, 8
%idxprom15 = sext i32 %add14 to i64
br label %for.body

for.body: ; preds = %entry
%arrayidx8 = getelementptr inbounds i16, i16* %sums, i64 %idxprom
%0 = load i16, i16* %arrayidx8, align 2
%arrayidx16 = getelementptr inbounds i16, i16* %sums, i64 %idxprom15
%1 = load i16, i16* %arrayidx16, align 2
%2 = insertelement <4 x i16> undef, i16 %0, i32 2
%3 = insertelement <4 x i16> %2, i16 %1, i32 3
%4 = zext <4 x i16> %3 to <4 x i32>
%5 = sub nsw <4 x i32> zeroinitializer, %4
%6 = sub nsw <4 x i32> zeroinitializer, %5
%7 = select <4 x i1> undef, <4 x i32> %6, <4 x i32> %5
%bin.rdx = add <4 x i32> %7, zeroinitializer
%rdx.shuf54 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx55 = add <4 x i32> %bin.rdx, %rdx.shuf54
%8 = extractelement <4 x i32> %bin.rdx55, i32 0
%op.extra = add nuw i32 %8, 0
%cmp25 = icmp slt i32 %op.extra, %thresh
br i1 %cmp25, label %if.then, label %if.end

if.then: ; preds = %for.body
unreachable

if.end: ; preds = %for.body
ret void
; CHECK-LABEL: test16:
; CHECK-NOT: lhzux
; CHECK: lxsihzx
; CHECK: lxsihzx
; NOTE(review): the lines below are spelled with a "P9BE-CHECK-" stem, while
; the big-endian checks earlier in this file attach the suffix directly to the
; P9BE prefix. They may therefore never be matched by FileCheck; confirm
; against the RUN lines before relying on big-endian coverage here.
; P9BE-CHECK-LABEL: test16:
; P9BE-CHECK-NOT: lhzux
; P9BE-CHECK: lxsihzx
; P9BE-CHECK: lxsihzx
}

; test8: two scalar i8 loads, from %sums[%delta] and %sums[%delta + 8], are
; inserted into lanes 2 and 3 of a <4 x i8>, zero-extended to <4 x i32> and
; reduced to a scalar that is compared against %thresh. The checks at the end
; of the function expect two lxsibzx loads and reject the update-form lbzux.
define void @test8(i8* nocapture readonly %sums, i32 signext %delta, i32 signext %thresh) {
entry:
%idxprom = sext i32 %delta to i64
%add14 = add nsw i32 %delta, 8
%idxprom15 = sext i32 %add14 to i64
br label %for.body

for.body: ; preds = %entry
%arrayidx8 = getelementptr inbounds i8, i8* %sums, i64 %idxprom
%0 = load i8, i8* %arrayidx8, align 2
%arrayidx16 = getelementptr inbounds i8, i8* %sums, i64 %idxprom15
%1 = load i8, i8* %arrayidx16, align 2
%2 = insertelement <4 x i8> undef, i8 %0, i32 2
%3 = insertelement <4 x i8> %2, i8 %1, i32 3
%4 = zext <4 x i8> %3 to <4 x i32>
%5 = sub nsw <4 x i32> zeroinitializer, %4
%6 = sub nsw <4 x i32> zeroinitializer, %5
%7 = select <4 x i1> undef, <4 x i32> %6, <4 x i32> %5
%bin.rdx = add <4 x i32> %7, zeroinitializer
%rdx.shuf54 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx55 = add <4 x i32> %bin.rdx, %rdx.shuf54
%8 = extractelement <4 x i32> %bin.rdx55, i32 0
%op.extra = add nuw i32 %8, 0
%cmp25 = icmp slt i32 %op.extra, %thresh
br i1 %cmp25, label %if.then, label %if.end

if.then: ; preds = %for.body
unreachable

if.end: ; preds = %for.body
ret void
; CHECK-LABEL: test8:
; CHECK-NOT: lbzux
; CHECK: lxsibzx
; CHECK: lxsibzx
; NOTE(review): the lines below are spelled with a "P9BE-CHECK-" stem, while
; the big-endian checks earlier in this file attach the suffix directly to the
; P9BE prefix. They may therefore never be matched by FileCheck; confirm
; against the RUN lines before relying on big-endian coverage here.
; P9BE-CHECK-LABEL: test8:
; P9BE-CHECK-NOT: lbzux
; P9BE-CHECK: lxsibzx
; P9BE-CHECK: lxsibzx
}

0 comments on commit 728e139

Please sign in to comment.