Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10566,6 +10566,20 @@
           (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
            LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
         return expandVSXLoadForLE(N, DCI);
+
+      // When we load a v4i8, the code can degrade rather quickly. Convert
+      // this to an i32 load and bitcast.
+      if (LoadVT == MVT::v4i8) {
+        SDValue ScalarLoad = DAG.getLoad(MVT::i32, dl, LD->getChain(),
+                                         LD->getBasePtr(), LD->getPointerInfo(),
+                                         false, LD->isNonTemporal(),
+                                         LD->isInvariant(), LD->getAlignment(),
+                                         LD->getAAInfo());
+        SDValue BitCast = DAG.getBitcast(MVT::v4i8, ScalarLoad);
+        return DAG.getNode(ISD::MERGE_VALUES, dl,
+                           DAG.getVTList(MVT::v4i8, MVT::Other),
+                           BitCast, ScalarLoad.getValue(1));
+      }
     }
 
     // We sometimes end up with a 64-bit integer load, from which we extract
Index: test/CodeGen/PowerPC/load-v4i8-improved.ll
===================================================================
--- test/CodeGen/PowerPC/load-v4i8-improved.ll
+++ test/CodeGen/PowerPC/load-v4i8-improved.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s \
+; RUN:   --check-prefix=CHECK-BE
+
+; Check that a <4 x i8> load feeding a splat is emitted as one lwz, a move to
+; a VSR and an xxspltw, with no vmrg/vperm, on both LE and BE targets.
+define <16 x i8> @test(i32* %s, i32* %t) {
+entry:
+  %0 = bitcast i32* %s to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 4
+  %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i8> %2
+; CHECK: lwz [[GPR:[0-9]+]], 0(3)
+; CHECK: mtvsrd [[VSR:[0-9]+]], [[GPR]]
+; CHECK: xxswapd [[SWP:[0-9]+]], [[VSR]]
+; CHECK: xxspltw 34, [[SWP]], 3
+; CHECK-NOT: vmrg
+; CHECK-NOT: vperm
+; CHECK-BE: lwz [[GPR:[0-9]+]], 0(3)
+; CHECK-BE: sldi [[SHL:[0-9]+]], [[GPR]], 32
+; CHECK-BE: mtvsrd [[VSR:[0-9]+]], [[SHL]]
+; CHECK-BE: xxspltw 34, [[VSR]], 0
+; CHECK-BE-NOT: vmrg
+; CHECK-BE-NOT: vperm
+}