Index: docs/CommandGuide/llvm-symbolizer.rst
===================================================================
--- docs/CommandGuide/llvm-symbolizer.rst
+++ docs/CommandGuide/llvm-symbolizer.rst
@@ -61,11 +61,14 @@
 -------
 
 .. option:: -obj
+
+  Path to object file to be symbolized.
 
-.. option:: -functions
+.. option:: -functions=[none|short|linkage]
 
-  Print function names as well as source file/line locations. Defaults to true.
+  Specify the way function names are printed (omit function name,
+  print short function name, or print full linkage name, respectively).
+  Defaults to ``linkage``.
 
 .. option:: -use-symbol-table
Index: include/llvm/CodeGen/MachineOperand.h
===================================================================
--- include/llvm/CodeGen/MachineOperand.h
+++ include/llvm/CodeGen/MachineOperand.h
@@ -42,7 +42,7 @@
 ///
 class MachineOperand {
 public:
-  enum MachineOperandType {
+  enum MachineOperandType : unsigned char {
     MO_Register,          ///< Register operand.
     MO_Immediate,         ///< Immediate operand
     MO_CImmediate,        ///< Immediate >64bit operand
@@ -65,7 +65,7 @@
 private:
   /// OpKind - Specify what kind of operand this is.  This discriminates the
   /// union.
-  unsigned char OpKind; // MachineOperandType
+  MachineOperandType OpKind;
 
   /// Subregister number for MO_Register.  A value of 0 indicates the
   /// MO_Register has no subReg.
Index: include/llvm/DebugInfo/DIContext.h
===================================================================
--- include/llvm/DebugInfo/DIContext.h
+++ include/llvm/DebugInfo/DIContext.h
@@ -70,7 +70,7 @@
 /// should be filled with data.
 struct DILineInfoSpecifier {
   enum class FileLineInfoKind { None, Default, AbsoluteFilePath };
-  enum class FunctionNameKind { None, LinkageName };
+  enum class FunctionNameKind { None, ShortName, LinkageName };
 
   FileLineInfoKind FLIKind;
   FunctionNameKind FNKind;
Index: lib/DebugInfo/DWARFDebugInfoEntry.cpp
===================================================================
--- lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -277,13 +277,15 @@
                                          FunctionNameKind Kind) const {
   if (!isSubroutineDIE() || Kind == FunctionNameKind::None)
     return nullptr;
-  // Try to get mangled name if possible.
-  if (const char *name =
-          getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr))
-    return name;
-  if (const char *name = getAttributeValueAsString(U, DW_AT_linkage_name,
-                                                   nullptr))
-    return name;
+  // Try to get the mangled name only if it was asked for.
+  if (Kind == FunctionNameKind::LinkageName) {
+    if (const char *name =
+            getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr))
+      return name;
+    if (const char *name =
+            getAttributeValueAsString(U, DW_AT_linkage_name, nullptr))
+      return name;
+  }
   if (const char *name = getAttributeValueAsString(U, DW_AT_name, nullptr))
     return name;
   // Try to get name from specification DIE.
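For illustration, an invocation using the new option might look like this
(binary path, address, and output below are made up, not taken from the
patch's test inputs):

  $ echo "/tmp/a.out 0x400560" | llvm-symbolizer --functions=short --demangle=false
  inlined_func
  /tmp/example.cc:10:0

  $ echo "/tmp/a.out 0x400560" | llvm-symbolizer --functions=linkage --demangle=false
  _Z12inlined_funcv
  /tmp/example.cc:10:0

With --functions=none only the file/line locations would be printed.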
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -208,7 +208,7 @@
   }
 
   default:
-    O << "<unknown operand type>";
+    O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
     return;
   }
 }
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -884,6 +884,7 @@
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
     SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1068,11 +1068,14 @@
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL, MVT::v4i32, Legal);
 
-    setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
+    // There is no BLENDI for byte vectors, so we don't custom lower
+    // byte vselects for now.
     setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
 
     // i8 and i16 vectors are custom, because the source register and source
     // memory operand types are not the same width.  f32 vectors are
@@ -1188,10 +1191,10 @@
     setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
 
-    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
 
     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
@@ -1236,6 +1239,7 @@
       setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
       setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
 
+      setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
     } else {
       setOperationAction(ISD::ADD, MVT::v4i64, Custom);
@@ -4694,11 +4698,17 @@
   return getInsertVINSERTImmediate(N, 256);
 }
 
+/// isZero - Returns true if V is a constant integer zero.
+static bool isZero(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isNullValue();
+}
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
-    return CN->isNullValue();
+  if (isZero(Elt))
+    return true;
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
     return CFP->getValueAPF().isPosZero();
   return false;
 }
@@ -7402,6 +7412,23 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
+
 // It is only safe to call this function if isINSERTPSMask is true for
 // this shufflevector mask.
 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
@@ -7413,7 +7440,6 @@
   // If we're transferring an i32 from memory to a specific element in a
   // register, we output a generic DAG that will match the PINSRD
   // instruction.
-  // TODO: Optimize for AVX cases too (VINSERTPS)
   MVT VT = SVOp->getSimpleValueType(0);
   MVT EVT = VT.getVectorElementType();
   SDValue V1 = SVOp->getOperand(0);
@@ -7446,17 +7472,10 @@
     // Trivial case, when From comes from a load and is only used by the
     // shuffle. Make it use insertps from the vector that we need from that
     // load.
-    SDValue Addr = From.getOperand(1);
-    SDValue NewAddr =
-        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
-                                    Addr.getSimpleValueType()));
-
-    LoadSDNode *Load = cast<LoadSDNode>(From);
     SDValue NewLoad =
-        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                    DAG.getMachineFunction().getMachineMemOperand(
-                        Load->getMemOperand(), 0, EVT.getStoreSize()));
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
 
     if (EVT == MVT::f32) {
       // Create this as a scalar to vector to match the instruction pattern.
@@ -7961,6 +7980,105 @@
   return SDValue();
 }
 
+// This function assumes its argument is a BUILD_VECTOR of constant or
+// undef SDNodes, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symmetric for both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+    int Lane1Cond = -1, Lane2Cond = -1;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      MaskValue |= !!Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !!Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
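A small standalone sketch (not part of the patch) of the lane-folding logic
above: for a two-lane v16i16 condition, the two 8-element lanes must agree
element-wise, and the agreeing bits become the 8-bit blend immediate. Undef
handling is omitted and the condition values are made up.

  // Illustration of BUILD_VECTORtoBlendMask's lane folding on plain ints.
  #include <cstdio>
  #include <vector>

  int main() {
    // A v16i16 condition whose two 8-element lanes agree element-wise.
    std::vector<int> Cond = {1, 0, 1, 0, 0, 1, 1, 0,
                             1, 0, 1, 0, 0, 1, 1, 0};
    unsigned NumElems = Cond.size();
    unsigned NumLanes = (NumElems - 1) / 8 + 1;
    unsigned NumElemsInLane = NumElems / NumLanes;
    unsigned MaskValue = 0;
    for (unsigned i = 0; i < NumElemsInLane; ++i) {
      int Lane1Cond = Cond[i];
      int Lane2Cond = (NumLanes == 2) ? Cond[i + NumElemsInLane] : Lane1Cond;
      if (Lane1Cond != Lane2Cond) {
        // Asymmetric lanes: no single immediate can express this blend.
        std::puts("lanes disagree: no immediate blend");
        return 1;
      }
      MaskValue |= (unsigned)Lane1Cond << i;
    }
    std::printf("blend immediate: 0x%x\n", MaskValue); // prints 0x65
    return 0;
  }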
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if this is not AVX2.
+  // AVX2 introduced the VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  // Some types for vselect were previously set to Expand, not Legal or
+  // Custom. Return an empty SDValue so we fall through to Expand, after
+  // the Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    return SDValue();
+  }
+
+  // We couldn't create a "Blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
+  return Op;
+}
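For example (illustrative IR, not one of the patch's tests), a select with a
constant condition such as the following now reaches the Custom path above
and can come out as a single blend-with-immediate on SSE4.1 and later,
instead of a blendv or and/andn/or sequence:

  define <4 x i32> @blend_imm(<4 x i32> %a, <4 x i32> %b) {
    %r = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %a, <4 x i32> %b
    ret <4 x i32> %r
  }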
+
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -10746,11 +10864,6 @@
   return false;
 }
 
-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   if (V.getOpcode() != ISD::TRUNCATE)
     return false;
@@ -14326,6 +14439,7 @@
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
@@ -17631,6 +17745,51 @@
   return std::make_pair(Opc, NeedSplit);
 }
 
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
 /// nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -18139,7 +18298,13 @@
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   // to simplify previous instructions.
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
-      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+      !DCI.isBeforeLegalize() &&
+      // We explicitly check against v8i16 and v16i16 because, although
+      // they're marked as Custom, they might only be legal when Cond is a
+      // build_vector of constants. This will be taken care of in a later
+      // condition.
+      (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
+       VT != MVT::v8i16)) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
 
     // Don't optimize vector selects that map to mask-registers.
@@ -18166,6 +18331,23 @@
       DCI.CommitTargetLoweringOpt(TLO);
   }
 
+  // We should generate an X86ISD::BLENDI from a vselect if its argument
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of constants.
+  // This specific pattern gets generated when we split a selector for a
+  // 512 bit vector on a machine without AVX512 (but with 256-bit vectors),
+  // during legalization:
+  //
+  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // If we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we
+  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
   return SDValue();
 }
 
@@ -20093,6 +20275,29 @@
   return SDValue();
 }
 
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+  } else
+    return SDValue();
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
@@ -20396,6 +20601,8 @@
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
   }
 
   return SDValue();
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6550,6 +6550,27 @@
   defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
 
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
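Roughly, the kind of IR these patterns catch (a sketch with made-up names;
the sse41.ll and avx2.ll tests added below are the authoritative cases) is an
insertps whose source lane comes through a splat of a loaded vector, which
now folds into a single memory-operand insertps:

  declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)

  define <4 x float> @fold(<4 x float> %x, <4 x float>* %p) {
    %v = load <4 x float>* %p, align 16
    %e = extractelement <4 x float> %v, i32 0
    %s = insertelement <4 x float> undef, float %e, i32 0
    %r = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %x, <4 x float> %s, i32 48)
    ret <4 x float> %r
  }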
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Round Instructions
 //===----------------------------------------------------------------------===//
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1220,6 +1220,65 @@
     if (MadeChange)
       return &GEP;
   }
 
+  // Check to see if the inputs to the PHI node are getelementptr instructions.
+  if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) {
+    GetElementPtrInst *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+    if (!Op1)
+      return nullptr;
+
+    int DI = -1;
+
+    for (auto I = PN->op_begin()+1, E = PN->op_end(); I != E; ++I) {
+      GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I);
+      if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
+        return nullptr;
+
+      for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
+        if (Op1->getOperand(J) != Op2->getOperand(J)) {
+          if (DI == -1) {
+            // We have not seen any differences in the GEPs feeding the
+            // PHI yet, so we record this one.
+            DI = J;
+          } else {
+            // The GEP is different by more than one input. While this could be
+            // extended to support GEPs that vary by more than one variable it
+            // doesn't make sense since it greatly increases the complexity and
+            // would result in an R+R+R addressing mode which no backend
+            // directly supports and would need to be broken into several
+            // simpler instructions anyway.
+            return nullptr;
+          }
+        }
+      }
+    }
+
+    GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+
+    if (DI == -1) {
+      // All the GEPs feeding the PHI are identical. Clone one down into our
+      // BB so that it can be merged with the current GEP.
+      GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
+                                            NewGEP);
+    } else {
+      // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
+      // into the current block so it can be merged, and create a new PHI to
+      // set that index.
+      PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(),
+                                          PN->getNumOperands());
+      for (auto &I : PN->operands())
+        NewPN->addIncoming(cast<GetElementPtrInst>(I)->getOperand(DI),
+                           PN->getIncomingBlock(I));
+
+      NewGEP->setOperand(DI, NewPN);
+      GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
+                                            NewGEP);
+    }
+
+    GEP.setOperand(0, NewGEP);
+    PtrOp = NewGEP;
+  }
+
   // Combine Indices - If the source pointer to this getelementptr instruction
   // is a getelementptr instruction, combine the indices of the two
   // getelementptr instructions into a single instruction.
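In rough terms, the new InstCombine fold rewrites a PHI over GEPs that differ
in exactly one index into a PHI over that index feeding a single GEP, which
can then merge with a user GEP. A sketch in IR (the type %T and all value
names here are made up; the gepphigep.ll test below exercises the real
pattern):

  ; before
  bb1: %g1 = getelementptr inbounds %T* %base, i64 %i
  bb2: %g2 = getelementptr inbounds %T* %base, i64 %j
  bb3: %p = phi %T* [ %g1, %bb1 ], [ %g2, %bb2 ]
       %q = getelementptr inbounds %T* %p, i64 0, i32 1

  ; after (conceptually)
  bb3: %idx = phi i64 [ %i, %bb1 ], [ %j, %bb2 ]
       %q = getelementptr inbounds %T* %base, i64 %idx, i32 1

GEPs differing in two or more indices are rejected, as the comment above
explains, because merging them would need an R+R+R addressing mode.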
Index: test/Analysis/CostModel/X86/vselect-cost.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/X86/vselect-cost.ll
@@ -0,0 +1,126 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+
+
+; Verify the cost of vector select instructions.
+
+; SSE41 added blend instructions with an immediate for <2 x double> and
+; <4 x float>. Integers of the same size should also use those instructions.
+
+define <2 x i64> @test_2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2i64':
+; SSE2: Cost Model: {{.*}} 4 for instruction: %sel = select <2 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+  %sel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
+  ret <2 x i64> %sel
+}
+
+define <2 x double> @test_2double(<2 x double> %a, <2 x double> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2double':
+; SSE2: Cost Model: {{.*}} 3 for instruction: %sel = select <2 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+  %sel = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
+  ret <2 x double> %sel
+}
+
+define <4 x i32> @test_4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i32':
+; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %sel
+}
+
+define <4 x float> @test_4float(<4 x float> %a, <4 x float> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4float':
+; SSE2: Cost Model: {{.*}} 7 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %sel
+}
+
+define <16 x i8> @test_16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i8':
+; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+  %sel = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %sel
+}
+
+; AVX added blend instructions with an immediate for <4 x double> and
+; <8 x float>. Integers of the same size should also use those instructions.
+define <4 x i64> @test_4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i64':
+; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i64> %a, <4 x i64> %b
+  ret <4 x i64> %sel
+}
+
+define <4 x double> @test_4double(<4 x double> %a, <4 x double> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4double':
+; SSE2: Cost Model: {{.*}} 6 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %a, <4 x double> %b
+  ret <4 x double> %sel
+}
+
+define <8 x i32> @test_8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8i32':
+; SSE2: Cost Model: {{.*}} 16 for instruction: %sel = select <8 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+  %sel = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i32> %a, <8 x i32> %b
+  ret <8 x i32> %sel
+}
+
+define <8 x float> @test_8float(<8 x float> %a, <8 x float> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8float':
+; SSE2: Cost Model: {{.*}} 14 for instruction: %sel = select <8 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+  %sel = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %a, <8 x float> %b
+  ret <8 x float> %sel
+}
+
+; AVX2
+define <16 x i16> @test_16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i16':
+; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <16 x i1>
+;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
+; AVX: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+  %sel = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i16> %a, <16 x i16> %b
+  ret <16 x i16> %sel
+}
+
+define <32 x i8> @test_32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_32i8':
+; SSE2: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <32 x i1>
+;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
+; AVX: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <32 x i1>
+  %sel = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <32 x i8> %a, <32 x i8> %b
+  ret <32 x i8> %sel
+}
+
Index: test/CodeGen/X86/avx-blend.ll
===================================================================
--- test/CodeGen/X86/avx-blend.ll
+++ test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,7 @@
 ; AVX128 tests:
 
 ;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +12,7 @@
 
 
 ;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +52,7 @@
 
 ;CHECK-LABEL: vsel_float8:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK: ret
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +61,7 @@
 
 ;CHECK-LABEL: vsel_i328:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK-NEXT: ret
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -69,7 +69,8 @@
 }
 
 ;CHECK-LABEL: vsel_double8:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
@@ -77,7 +78,8 @@
 }
 
 ;CHECK-LABEL: vsel_i648:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
@@ -86,7 +88,7 @@
 
 ;CHECK-LABEL: vsel_double4:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vshufpd $10
 ;CHECK-NEXT: ret
 define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -111,5 +113,3 @@
   %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
   ret <2 x double> %min
 }
-
-
Index: test/CodeGen/X86/avx2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx2.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %ret
+}
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
Index: test/CodeGen/X86/blend-msb.ll
===================================================================
--- test/CodeGen/X86/blend-msb.ll
+++ test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@
 
 ; Verify that we produce movss instead of blendvps when possible.
 ;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@
 }
 
 ;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
@@ -21,14 +21,8 @@
   ret <4 x i8> %vsel
 }
 
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
 ;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+;CHECK: pblendw $17
 ;CHECK: ret
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
Index: test/CodeGen/X86/fold-load-vec.ll
===================================================================
--- test/CodeGen/X86/fold-load-vec.ll
+++ test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
 ; loads from m32.
 define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
 ; CHECK: insertps
 entry:
   %source.addr = alloca <4 x float>*, align 8
Index: test/CodeGen/X86/sse41-blend.ll
===================================================================
--- test/CodeGen/X86/sse41-blend.ll
+++ test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
 
 ;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@
 
 
 ;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> %v1, <4 x i8> %v2
@@ -18,7 +18,7 @@
 }
 
 ;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@
 
 
 ;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -576,3 +576,119 @@
   %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
   ret <4 x float> %res
 }
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %ret
+}
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
Index: test/DebugInfo/llvm-symbolizer.test
===================================================================
--- test/DebugInfo/llvm-symbolizer.test
+++ test/DebugInfo/llvm-symbolizer.test
@@ -10,7 +10,7 @@
 RUN: echo "%p/Inputs/macho-universal:x86_64 0x100000f05" >> %t.input
 RUN: echo "%p/Inputs/llvm-symbolizer-dwo-test 0x400514" >> %t.input
 
-RUN: llvm-symbolizer --functions --inlining --demangle=false \
+RUN: llvm-symbolizer --functions=linkage --inlining --demangle=false \
 RUN:    --default-arch=i386 < %t.input | FileCheck %s
 
 CHECK: main
@@ -87,3 +87,9 @@
 RUN:   | FileCheck %s --check-prefix=STRIPPED
 
 STRIPPED: global_func
+
+RUN: echo "%p/Inputs/dwarfdump-test4.elf-x86-64 0x62c" > %t.input7
+RUN: llvm-symbolizer --functions=short --use-symbol-table=false --demangle=false < %t.input7 \
+RUN:    | FileCheck %s --check-prefix=SHORT_FUNCTION_NAME
+
+SHORT_FUNCTION_NAME-NOT: _Z1cv
Index: test/Transforms/InstCombine/gepphigep.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/gepphigep.ll
@@ -0,0 +1,56 @@
+; RUN: opt -instcombine -S  < %s | FileCheck %s
+
+%struct1 = type { %struct2*, i32, i32, i32 }
+%struct2 = type { i32, i32 }
+
+define i32 @test1(%struct1* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) {
+bb:
+  %tmp = getelementptr inbounds %struct1* %dm, i64 0, i32 0
+  %tmp1 = load %struct2** %tmp, align 8
+  br i1 %tmp4, label %bb1, label %bb2
+
+bb1:
+  %tmp10 = getelementptr inbounds %struct2* %tmp1, i64 %tmp9
+  %tmp11 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 0
+  store i32 0, i32* %tmp11, align 4
+  br label %bb3
+
+bb2:
+  %tmp20 = getelementptr inbounds %struct2* %tmp1, i64 %tmp19
+  %tmp21 = getelementptr inbounds %struct2* %tmp20, i64 0, i32 0
+  store i32 0, i32* %tmp21, align 4
+  br label %bb3
+
+bb3:
+  %phi = phi %struct2* [ %tmp10, %bb1 ], [ %tmp20, %bb2 ]
+  %tmp24 = getelementptr inbounds %struct2* %phi, i64 0, i32 1
+  %tmp25 = load i32* %tmp24, align 4
+  ret i32 %tmp25
+
+; CHECK-LABEL: @test1(
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp19, i32 0
+; CHECK: %[[PHI:[0-9A-Za-z]+]] = phi i64 [ %tmp9, %bb1 ], [ %tmp19, %bb2 ]
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %[[PHI]], i32 1
+}
+
+define i32 @test2(%struct1* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) {
+bb:
+  %tmp = getelementptr inbounds %struct1* %dm, i64 0, i32 0
+  %tmp1 = load %struct2** %tmp, align 8
+  %tmp10 = getelementptr inbounds %struct2* %tmp1, i64 %tmp9
+  %tmp11 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 0
+  store i32 0, i32* %tmp11, align 4
+  %tmp20 = getelementptr inbounds %struct2* %tmp1, i64 %tmp19
+  %tmp21 = getelementptr inbounds %struct2* %tmp20, i64 0, i32 0
+  store i32 0, i32* %tmp21, align 4
+  %tmp24 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 1
+  %tmp25 = load i32* %tmp24, align 4
+  ret i32 %tmp25
+
+; CHECK-LABEL: @test2(
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp19, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 1
+}
Index: tools/llvm-symbolizer/LLVMSymbolize.h
===================================================================
--- tools/llvm-symbolizer/LLVMSymbolize.h
+++ tools/llvm-symbolizer/LLVMSymbolize.h
@@ -24,6 +24,7 @@
 
 namespace llvm {
 
+typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;
 using namespace object;
 
 namespace symbolize {
@@ -34,17 +35,17 @@
 public:
   struct Options {
     bool UseSymbolTable : 1;
-    bool PrintFunctions : 1;
+    FunctionNameKind PrintFunctions;
    bool PrintInlining : 1;
     bool Demangle : 1;
     std::string DefaultArch;
-    Options(bool UseSymbolTable = true, bool PrintFunctions = true,
+    Options(bool UseSymbolTable = true,
+            FunctionNameKind PrintFunctions = FunctionNameKind::LinkageName,
             bool PrintInlining = true, bool Demangle = true,
             std::string DefaultArch = "")
         : UseSymbolTable(UseSymbolTable), PrintFunctions(PrintFunctions),
           PrintInlining(PrintInlining), Demangle(Demangle),
-          DefaultArch(DefaultArch) {
-    }
+          DefaultArch(DefaultArch) {}
   };
 
   LLVMSymbolizer(const Options &Opts = Options()) : Opts(Opts) {}
Index: tools/llvm-symbolizer/LLVMSymbolize.cpp
===================================================================
--- tools/llvm-symbolizer/LLVMSymbolize.cpp
+++ tools/llvm-symbolizer/LLVMSymbolize.cpp
@@ -39,8 +39,7 @@
 getDILineInfoSpecifier(const LLVMSymbolizer::Options &Opts) {
   return DILineInfoSpecifier(
       DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
-      Opts.PrintFunctions ? DILineInfoSpecifier::FunctionNameKind::LinkageName
-                          : DILineInfoSpecifier::FunctionNameKind::None);
+      Opts.PrintFunctions);
 }
 
 ModuleInfo::ModuleInfo(ObjectFile *Obj, DIContext *DICtx)
@@ -117,7 +116,7 @@
         ModuleOffset, getDILineInfoSpecifier(Opts));
   }
   // Override function name from symbol table if necessary.
-  if (Opts.PrintFunctions && Opts.UseSymbolTable) {
+  if (Opts.PrintFunctions != FunctionNameKind::None && Opts.UseSymbolTable) {
     std::string FunctionName;
     uint64_t Start, Size;
     if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset,
@@ -140,7 +139,7 @@
       InlinedContext.addFrame(DILineInfo());
   }
   // Override the function name in lower frame with name from symbol table.
-  if (Opts.PrintFunctions && Opts.UseSymbolTable) {
+  if (Opts.PrintFunctions != FunctionNameKind::None && Opts.UseSymbolTable) {
     DIInliningInfo PatchedInlinedContext;
     for (uint32_t i = 0, n = InlinedContext.getNumberOfFrames(); i < n; i++) {
       DILineInfo LineInfo = InlinedContext.getFrame(i);
@@ -398,7 +397,7 @@
 // cannot fetch. We replace it to "??" to make our output closer to addr2line.
 static const std::string kDILineInfoBadString = "<invalid>";
 std::stringstream Result;
-  if (Opts.PrintFunctions) {
+  if (Opts.PrintFunctions != FunctionNameKind::None) {
     std::string FunctionName = LineInfo.FunctionName;
     if (FunctionName == kDILineInfoBadString)
       FunctionName = kBadString;
Index: tools/llvm-symbolizer/llvm-symbolizer.cpp
===================================================================
--- tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -35,10 +35,15 @@
          cl::desc("Prefer names in symbol table to names "
                   "in debug info"));
 
-static cl::opt<bool>
-ClPrintFunctions("functions", cl::init(true),
-                 cl::desc("Print function names as well as line "
-                          "information for a given address"));
+static cl::opt<FunctionNameKind> ClPrintFunctions(
+    "functions", cl::init(FunctionNameKind::LinkageName),
+    cl::desc("Print function name for a given address:"),
+    cl::values(clEnumValN(FunctionNameKind::None, "none", "omit function name"),
+               clEnumValN(FunctionNameKind::ShortName, "short",
+                          "print short function name"),
+               clEnumValN(FunctionNameKind::LinkageName, "linkage",
+                          "print function linkage name"),
+               clEnumValEnd));
 
 static cl::opt<bool>
 ClPrintInlining("inlining", cl::init(true),
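A usage sketch of the updated Options interface from C++ (illustrative only;
includes, error handling, and the module path are assumptions, and the
symbolizer output format is elided):

  #include "LLVMSymbolize.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;
  using namespace llvm::symbolize;

  void printShortName() {
    // Ask for short function names instead of the default linkage names.
    LLVMSymbolizer::Options Opts(/*UseSymbolTable=*/true,
                                 FunctionNameKind::ShortName,
                                 /*PrintInlining=*/true,
                                 /*Demangle=*/false,
                                 /*DefaultArch=*/"");
    LLVMSymbolizer Symbolizer(Opts);
    // Symbolize an offset in a module; prints the short function name plus
    // file/line, mirroring what --functions=short does on the command line.
    outs() << Symbolizer.symbolizeCode("/tmp/a.out", 0x400560);
  }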