Index: llvm/lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.h +++ llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -953,6 +953,8 @@ SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const; + SDValue LowerTRUNCATEVector(SDValue Op, SelectionDAG &DAG) const; + SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -119,6 +119,8 @@ static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); +static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl); + // FIXME: Remove this once the bug has been fixed! extern cl::opt<bool> ANDIGlueBug; @@ -640,6 +642,12 @@ // with merges, splats, etc. setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::AND , MVT::v4i32, Legal); setOperationAction(ISD::OR , MVT::v4i32, Legal); setOperationAction(ISD::XOR , MVT::v4i32, Legal); @@ -6890,6 +6898,60 @@ Op.getOperand(0)); } +SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, + SelectionDAG &DAG) const { + + // Implements a vector truncate that fits in a vector register as a shuffle. + // We want to legalize vector truncates down to where the source fits in + // a vector register (and target is therefore smaller than vector register + // size). 
At that point legalization will try to custom lower the sub-legal + // result and get here - where we can contain the truncate as a single target + // operation. + + // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows: + // [MSB1 LSB1][MSB2 LSB2] to [LSB1][LSB2] + // + // We will implement it for big-endian ordering as this (where x denotes + // undefined): + // [MSB1 LSB1][MSB2 LSB2][x x][x x][x x][x x][x x][x x] to + // [LSB1][LSB2][x][x][x][x][x][x][x][x][x][x][x][x][x][x] + // + // The same operation in little-endian ordering will be: + // [LSB1 MSB1][LSB2 MSB2][x x][x x][x x][x x][x x][x x] to + // [LSB1][LSB2][x][x][x][x][x][x][x][x][x][x][x][x][x][x] + + assert(Op.getValueType().isVector() && "Vector type expected."); + + SDLoc DL(Op); + SDValue N1 = Op.getOperand(0); + unsigned SrcSize = N1.getValueType().getSizeInBits(); + assert(SrcSize <= 128 && "Source must be no wider than a legal PPC vector"); + SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL); + + EVT TrgVT = Op.getValueType(); + unsigned TrgNumElts = TrgVT.getVectorNumElements(); + EVT EltVT = TrgVT.getVectorElementType(); + unsigned WideNumElts = 128 / EltVT.getSizeInBits(); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts); + + // First list the elements we want to keep. + unsigned SizeMult = SrcSize / TrgVT.getSizeInBits(); + SmallVector<int, 16> ShuffV; + if (Subtarget.isLittleEndian()) + for (unsigned i = 0; i < TrgNumElts; ++i) + ShuffV.push_back(i * SizeMult); + else + for (unsigned i = 1; i <= TrgNumElts; ++i) + ShuffV.push_back(i * SizeMult - 1); + + // Populate the remaining elements with undefs. + for (unsigned i = TrgNumElts; i < WideNumElts; ++i) + ShuffV.push_back(i + WideNumElts); + + SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc); + return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV); +} + /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when /// possible. 
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { @@ -9737,6 +9799,14 @@ return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; + case ISD::TRUNCATE: { + EVT TrgVT = N->getValueType(0); + if (TrgVT.isVector() && + isOperationCustom(N->getOpcode(), TrgVT) && + N->getOperand(0).getValueType().getSizeInBits() <= 128) + Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG)); + return; + } case ISD::BITCAST: // Don't handle bitcast here. return; Index: llvm/test/CodeGen/PowerPC/vec-trunc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/PowerPC/vec-trunc.ll @@ -0,0 +1,90 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE + +define void @test8i8(<8 x i8>* nocapture %Sink, <8 x i16>* nocapture readonly %SrcPtr) { +entry: + %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16 + %1 = trunc <8 x i16> %0 to <8 x i8> + store <8 x i8> %1, <8 x i8>* %Sink, align 16 + ret void +} +; CHECK-LABEL: @test8i8 +; CHECK: vpkuhum +; CHECK-BE-LABEL: @test8i8 +; CHECK-BE: vpkuhum + +define void @test4i8(<4 x i8>* nocapture %Sink, <4 x i16>* nocapture readonly %SrcPtr) { +entry: + %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16 + %1 = trunc <4 x i16> %0 to <4 x i8> + store <4 x i8> %1, <4 x i8>* %Sink, align 16 + ret void +} +; CHECK-LABEL: @test4i8 +; CHECK: vpkuhum +; CHECK-BE-LABEL: @test4i8 +; CHECK-BE: vpkuhum + +define void @test4i8w(<4 x i8>* nocapture %Sink, <4 x i32>* nocapture readonly %SrcPtr) { +entry: + %0 = load <4 x i32>, <4 x i32>* %SrcPtr, align 16 + %1 = trunc <4 x i32> %0 to <4 x i8> + store <4 x i8> %1, <4 x i8>* %Sink, align 16 + ret void +} +; CHECK-LABEL: 
@test4i8w +; CHECK: vperm +; CHECK-BE-LABEL: @test4i8w +; CHECK-BE: vperm + +define void @test2i8(<2 x i8>* nocapture %Sink, <2 x i16>* nocapture readonly %SrcPtr) { +entry: + %0 = load <2 x i16>, <2 x i16>* %SrcPtr, align 16 + %1 = trunc <2 x i16> %0 to <2 x i8> + store <2 x i8> %1, <2 x i8>* %Sink, align 16 + ret void +} +; CHECK-LABEL: @test2i8 +; CHECK: vpkuhum +; CHECK-BE-LABEL: @test2i8 +; CHECK-BE: vpkuhum + +define void @test4i16(<4 x i16>* nocapture %Sink, <4 x i32>* nocapture readonly %SrcPtr) { +entry: + %0 = load <4 x i32>, <4 x i32>* %SrcPtr, align 16 + %1 = trunc <4 x i32> %0 to <4 x i16> + store <4 x i16> %1, <4 x i16>* %Sink, align 16 + ret void +} +; CHECK-LABEL: @test4i16 +; CHECK: vpkuwum +; CHECK-BE-LABEL: @test4i16 +; CHECK-BE: vpkuwum + +define void @test2i16(<2 x i16>* nocapture %Sink, <2 x i32>* nocapture readonly %SrcPtr) { +entry: + %0 = load <2 x i32>, <2 x i32>* %SrcPtr, align 16 + %1 = trunc <2 x i32> %0 to <2 x i16> + store <2 x i16> %1, <2 x i16>* %Sink, align 16 + ret void +} +; CHECK-LABEL: @test2i16 +; CHECK: vpkuwum +; CHECK-BE-LABEL: @test2i16 +; CHECK-BE: vpkuwum + +define void @test2i16d(<2 x i16>* nocapture %Sink, <2 x i64>* nocapture readonly %SrcPtr) { +entry: + %0 = load <2 x i64>, <2 x i64>* %SrcPtr, align 16 + %1 = trunc <2 x i64> %0 to <2 x i16> + store <2 x i16> %1, <2 x i16>* %Sink, align 16 + ret void +} +; CHECK-LABEL: @test2i16d +; CHECK: vperm +; CHECK-BE-LABEL: @test2i16d +; CHECK-BE: vperm