diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15202,13 +15202,17 @@
       }
     }
     break;
-  case ISD::BSWAP:
+  case ISD::BSWAP: {
     // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
-    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
-        N->getOperand(0).hasOneUse() &&
+    // For subtargets without LDBRX, we can still do better than the default
+    // expansion even for 64-bit BSWAP (LOAD).
+    bool Is64BitBswapOn64BitTgt =
+        Subtarget.isPPC64() && N->getValueType(0) == MVT::i64;
+    bool IsSingleUseNormalLd = ISD::isNormalLoad(N->getOperand(0).getNode()) &&
+                               N->getOperand(0).hasOneUse();
+    if (IsSingleUseNormalLd &&
         (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
-         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
-          N->getValueType(0) == MVT::i64))) {
+         (Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
       SDValue Load = N->getOperand(0);
       LoadSDNode *LD = cast<LoadSDNode>(Load);
       // Create the byte-swapping load.
@@ -15239,7 +15243,32 @@
       // Return N so it doesn't get rechecked!
       return SDValue(N, 0);
     }
-    break;
+    // Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
+    // before legalization so that the BUILD_PAIR is handled correctly.
+    if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
+        !IsSingleUseNormalLd)
+      return SDValue();
+    LoadSDNode *LD = cast<LoadSDNode>(N->getOperand(0));
+
+    // Can't split volatile or atomic loads.
+    if (!LD->isSimple())
+      return SDValue();
+    SDValue BasePtr = LD->getBasePtr();
+    SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
+                             LD->getPointerInfo(), LD->getAlignment());
+    Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo);
+    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                          DAG.getIntPtrConstant(4, dl));
+    SDValue Hi = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
+                             LD->getPointerInfo(), LD->getAlignment());
+    Hi = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Hi);
+    SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Hi, Lo);
+    SDValue TF =
+        DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                    Hi.getOperand(0).getValue(1), Lo.getOperand(0).getValue(1));
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), TF);
+    return Res;
+  }
   case PPCISD::VCMP:
     // If a VCMP_rec node already exists with exactly the same operands as this
     // node, use its result instead of this node (VCMP_rec computes both a CR6
diff --git a/llvm/test/CodeGen/PowerPC/bswap-load-store.ll b/llvm/test/CodeGen/PowerPC/bswap-load-store.ll
--- a/llvm/test/CodeGen/PowerPC/bswap-load-store.ll
+++ b/llvm/test/CodeGen/PowerPC/bswap-load-store.ll
@@ -101,6 +101,8 @@
   ret i16 %tmp6
 }
 
+; TODO: combine the bswap feeding a store on subtargets
+; that do not have an STDBRX.
 define void @STDBRX(i64 %i, i8* %ptr, i64 %off) {
 ; PWR7_32-LABEL: STDBRX:
 ; PWR7_32:       # %bb.0:
@@ -149,19 +151,11 @@
 ;
 ; X64-LABEL: LDBRX:
 ; X64:       # %bb.0:
-; X64-NEXT:    ldx r4, r3, r4
-; X64-NEXT:    rotldi r5, r4, 16
-; X64-NEXT:    rotldi r3, r4, 8
-; X64-NEXT:    rldimi r3, r5, 8, 48
-; X64-NEXT:    rotldi r5, r4, 24
-; X64-NEXT:    rldimi r3, r5, 16, 40
-; X64-NEXT:    rotldi r5, r4, 32
-; X64-NEXT:    rldimi r3, r5, 24, 32
-; X64-NEXT:    rotldi r5, r4, 48
-; X64-NEXT:    rldimi r3, r5, 40, 16
-; X64-NEXT:    rotldi r5, r4, 56
-; X64-NEXT:    rldimi r3, r5, 48, 8
-; X64-NEXT:    rldimi r3, r4, 56, 0
+; X64-NEXT:    li r5, 4
+; X64-NEXT:    lwbrx r6, r3, r4
+; X64-NEXT:    add r3, r3, r4
+; X64-NEXT:    lwbrx r3, r3, r5
+; X64-NEXT:    rldimi r3, r6, 32, 0
 ; X64-NEXT:    blr
 ;
 ; PWR7_64-LABEL: LDBRX:
diff --git a/llvm/test/CodeGen/PowerPC/ld-bswap64-no-ldbrx.ll b/llvm/test/CodeGen/PowerPC/ld-bswap64-no-ldbrx.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ld-bswap64-no-ldbrx.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64-- -mcpu=pwr5 -verify-machineinstrs < %s | \
+; RUN:   FileCheck %s
+define void @bs(i64* %p) {
+; CHECK-LABEL: bs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li 4, 4
+; CHECK-NEXT:    lwbrx 5, 0, 3
+; CHECK-NEXT:    lwbrx 4, 3, 4
+; CHECK-NEXT:    rldimi 4, 5, 32, 0
+; CHECK-NEXT:    std 4, 0(3)
+; CHECK-NEXT:    blr
+  %x = load i64, i64* %p, align 8
+  %b = call i64 @llvm.bswap.i64(i64 %x)
+  store i64 %b, i64* %p, align 8
+  ret void
+}
+
+define i64 @volatile_ld(i64* %p) {
+; CHECK-LABEL: volatile_ld:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld 4, 0(3)
+; CHECK-NEXT:    rotldi 5, 4, 16
+; CHECK-NEXT:    rotldi 3, 4, 8
+; CHECK-NEXT:    rldimi 3, 5, 8, 48
+; CHECK-NEXT:    rotldi 5, 4, 24
+; CHECK-NEXT:    rldimi 3, 5, 16, 40
+; CHECK-NEXT:    rotldi 5, 4, 32
+; CHECK-NEXT:    rldimi 3, 5, 24, 32
+; CHECK-NEXT:    rotldi 5, 4, 48
+; CHECK-NEXT:    rldimi 3, 5, 40, 16
+; CHECK-NEXT:    rotldi 5, 4, 56
+; CHECK-NEXT:    rldimi 3, 5, 48, 8
+; CHECK-NEXT:    rldimi 3, 4, 56, 0
+; CHECK-NEXT:    blr
+  %x = load volatile i64, i64* %p, align 8
+  %b = call i64 @llvm.bswap.i64(i64 %x)
+  ret i64 %b
+}
+
+define i64 @misaligned_ld(i64* %p) {
+; CHECK-LABEL: misaligned_ld:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li 4, 4
+; CHECK-NEXT:    lwbrx 5, 0, 3
+; CHECK-NEXT:    lwbrx 3, 3, 4
+; CHECK-NEXT:    rldimi 3, 5, 32, 0
+; CHECK-NEXT:    blr
+  %x = load i64, i64* %p, align 1
+  %b = call i64 @llvm.bswap.i64(i64 %x)
+  ret i64 %b
+}
+
+declare i64 @llvm.bswap.i64(i64) #2
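
For reference only (not part of the patch): a minimal C++ sketch of the source pattern the new ld-bswap64-no-ldbrx.ll test exercises. The file and function names below are made up for illustration. Compiled with something like "clang --target=powerpc64 -mcpu=pwr5 -O2 -S", the plain load feeding __builtin_bswap64 is the case the new combine is meant to improve on pre-LDBRX 64-bit subtargets, while the volatile load is expected to keep the generic rotate/insert expansion, as in the @volatile_ld check lines above.

  // bswap_demo.cpp - hypothetical example, not part of the patch.
  #include <cstdint>

  // Mirrors @bs in the new test: a single-use, non-volatile 64-bit load
  // feeding a byte swap, with the result stored back to the same location.
  void bs(uint64_t *p) { *p = __builtin_bswap64(*p); }

  // Mirrors @volatile_ld: a volatile load cannot be split into two 32-bit
  // loads, so the combine is expected to leave this one alone.
  uint64_t volatile_ld(volatile uint64_t *p) {
    return __builtin_bswap64(*p);
  }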