Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -181,12 +181,20 @@ } bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { - if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 && - T->getIntegerBitWidth() <= 16) + const IntegerType *IntTy = dyn_cast(T); + if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) return true; - if (!T->isVectorTy()) - return false; - return needsPromotionToI32(cast(T)->getElementType()); + + if (const VectorType *VT = dyn_cast(T)) { + // TODO: The set of packed operations is more limited, so may want to + // promote some anyway. + if (ST->hasVOP3PInsts()) + return false; + + return needsPromotionToI32(VT->getElementType()); + } + + return false; } // Return true if the op promoted to i32 should have nsw set. Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -159,6 +159,10 @@ SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + void SelectADD_SUB_I64(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); @@ -305,6 +309,20 @@ llvm_unreachable("invalid vector size"); } +static bool getConstantValue(SDValue N, uint32_t &Out) { + if (const ConstantSDNode *C = dyn_cast(N)) { + Out = C->getAPIntValue().getZExtValue(); + return true; + } + + if (const ConstantFPSDNode *C = dyn_cast(N)) { + Out = C->getValueAPF().bitcastToAPInt().getZExtValue(); + return true; + } + + return false; +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -356,7 +374,24 @@ EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); + + if (VT == MVT::v2i16 || VT == MVT::v2f16) { + if (Opc == ISD::BUILD_VECTOR) { + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + uint32_t K = LHSVal | (RHSVal << 16); + CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT, + CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32)); + return; + } + } + + break; + } + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { RegClassID = selectSGPRVectorRegClassID(NumVectorElts); } else { @@ -1565,7 +1600,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - Src = In; if (Src.getOpcode() == ISD::FNEG) { @@ -1579,7 +1613,6 @@ } SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; } @@ -1633,6 +1666,38 @@ return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + Src = In; + + // FIXME: Look for on separate components + if (Src.getOpcode() == ISD::FNEG) { + Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = Src.getOperand(0); + } + + // Packed instructions do not have abs modifiers. + + // FIXME: Handle abs/neg of individual components. + // FIXME: Handle swizzling with op_sel + Mods |= SISrcMods::OP_SEL_1; + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp and op_sel + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3PMods(In, Src, SrcMods); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast(getTargetLowering()); Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -644,12 +644,17 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && - VT == MVT::f16); + + // Packed operations do not have a fabs modifier. + return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16); } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - return isFAbsFree(VT); + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16) || + (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -160,6 +160,12 @@ // Misc. PatFrags //===----------------------------------------------------------------------===// +class HasOneUseUnaryOp : PatFrag< + (ops node:$src0), + (op $src0), + [{ return N->hasOneUse(); }] +>; + class HasOneUseBinOp : PatFrag< (ops node:$src0, node:$src1), (op $src0, $src1), @@ -422,6 +428,7 @@ int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP16_ONE = 0x3C00; +int V2FP16_ONE = 0x3C003C00; int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; @@ -630,6 +637,8 @@ (med3Inst $src0, $src1, $src2) >; +def trunc_oneuse : HasOneUseUnaryOp; + let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp; def smin_oneuse : HasOneUseBinOp; @@ -643,6 +652,8 @@ } // Properties = [SDNPCommutative, SDNPAssociative] def sub_oneuse : HasOneUseBinOp; +def srl_oneuse : HasOneUseBinOp; +def shl_oneuse : HasOneUseBinOp; def select_oneuse : HasOneUseTernaryOp