Index: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -226,7 +226,7 @@ SDValue &Index, SDValue &Disp, SDValue &Segment); - // Convience method where P is also root. + // Convenience method where P is also root. bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -234,6 +234,12 @@ return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } + // Try to fold a vector load. This makes sure the load isn't non-temporal. + bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -449,6 +455,12 @@ bool matchBEXTRFromAnd(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; + + MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, + const SDLoc &dl, MVT VT, SDNode *Node); + MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, + const SDLoc &dl, MVT VT, SDNode *Node, + SDValue &InFlag); }; } @@ -2006,6 +2018,20 @@ N.getOperand(1), Base, Scale, Index, Disp, Segment); } +bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + if (!ISD::isNON_EXTLoad(N.getNode()) || + useNonTemporalLoad(cast(N)) || + !IsProfitableToFold(N, P, Root) || + !IsLegalToFold(N, P, Root, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + /// Return an SDNode that returns the value of the global base register. 
/// Output instructions required to initialize the global base register, /// if necessary. @@ -2563,6 +2589,83 @@ return true; } +// Emit a PCMPISTR(I/M) instruction. +MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, + bool MayFoldLoad, const SDLoc &dl, + MVT VT, SDNode *Node) { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + SDValue Imm = Node->getOperand(2); + const ConstantInt *Val = cast(Imm)->getConstantIntValue(); + Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); + + // If there is a load, it will be behind a bitcast. We don't need to check + // alignment on this load. + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() && + tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + SDValue Load = N1.getOperand(0); + SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, + Load.getOperand(0) }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + // Update the chain. + ReplaceUses(Load.getValue(1), SDValue(CNode, 2)); + // Record the mem-refs + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Load)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); + return CNode; + } + + SDValue Ops[] = { N0, N1, Imm }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); + MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); + return CNode; +} + +// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need +// to emit a second instruction after this one. This is needed since we have two +// copyToReg nodes glued before this and we need to continue that glue through. 
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, + bool MayFoldLoad, const SDLoc &dl, + MVT VT, SDNode *Node, + SDValue &InFlag) { + SDValue N0 = Node->getOperand(0); + SDValue N2 = Node->getOperand(2); + SDValue Imm = Node->getOperand(4); + const ConstantInt *Val = cast(Imm)->getConstantIntValue(); + Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); + + // If there is a load, it will be behind a bitcast. We don't need to check + // alignment on this load. + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() && + tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + SDValue Load = N2.getOperand(0); + SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, + Load.getOperand(0), InFlag }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + InFlag = SDValue(CNode, 3); + // Update the chain. + ReplaceUses(Load.getValue(1), SDValue(CNode, 2)); + // Record the mem-refs + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Load)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); + return CNode; + } + + SDValue Ops[] = { N0, N2, Imm, InFlag }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); + MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); + InFlag = SDValue(CNode, 2); + return CNode; +} + /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large @@ -3184,6 +3287,70 @@ } break; } + case X86ISD::PCMPISTR: { + if (!Subtarget->hasSSE42()) + break; + + bool NeedIndex = !SDValue(Node, 0).use_empty(); + bool NeedMask = !SDValue(Node, 1).use_empty(); + // We can't fold a load if we are going to make two instructions. 
+ bool MayFoldLoad = !NeedIndex || !NeedMask; + + MachineSDNode *CNode; + if (NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; + CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); + } + if (NeedIndex || !NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; + CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + } + + // Connect the flag usage to the last instruction created. + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 0)); + CurDAG->RemoveDeadNode(Node); + return; + } + case X86ISD::PCMPESTR: { + if (!Subtarget->hasSSE42()) + break; + + // Copy the two implicit register inputs. + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, + Node->getOperand(1), + SDValue()).getValue(1); + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, + Node->getOperand(3), InFlag).getValue(1); + + bool NeedIndex = !SDValue(Node, 0).use_empty(); + bool NeedMask = !SDValue(Node, 1).use_empty(); + // We can't fold a load if we are going to make two instructions. + bool MayFoldLoad = !NeedIndex || !NeedMask; + + MachineSDNode *CNode; + if (NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; + CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, + InFlag); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); + } + if (NeedIndex || !NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; + unsigned MOpc = Subtarget->hasAVX() ? 
X86::VPCMPESTRIrm : X86::PCMPESTRIrm; + CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + } + // Connect the flag usage to the last instruction created. + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); + CurDAG->RemoveDeadNode(Node); + return; + } + case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return; Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -576,8 +576,13 @@ RDSEED, // SSE42 string comparisons. - PCMPISTRI, - PCMPESTRI, + // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG + // will emit one or two instructions based on which results are used. If + // flags and index/mask are both used, this allows us to use a single instruction since + // we won't have to pick an opcode for flags. Instead we can rely on the + // DAG to CSE everything and decide at isel. + PCMPISTR, + PCMPESTR, // Test if in transactional execution. XTEST, Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -20947,50 +20947,50 @@ switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
case Intrinsic::x86_sse42_pcmpistria128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_E; break; } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); - SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); + SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -20998,15 +20998,28 @@ case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; else - Opcode = X86ISD::PCMPESTRI; + Opcode 
= X86ISD::PCMPESTR; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } + case Intrinsic::x86_sse42_pcmpistrm128: + case Intrinsic::x86_sse42_pcmpestrm128: { + unsigned Opcode; + if (IntNo == Intrinsic::x86_sse42_pcmpistrm128) + Opcode = X86ISD::PCMPISTR; + else + Opcode = X86ISD::PCMPESTR; + + SmallVector NewOps(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); + return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); + } + case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -25794,8 +25807,8 @@ case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; - case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; - case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; + case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; + case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; @@ -26179,79 +26192,6 @@ return sinkMBB; } -// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 -// or XMM0_V32I8 in AVX all of this code can be replaced with that -// in the .td file. 
-static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, - const TargetInstrInfo *TII) { - unsigned Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; - case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; - case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; - case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; - case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; - case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; - case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; - case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; - } - - DebugLoc dl = MI.getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - - unsigned NumArgs = MI.getNumOperands(); - for (unsigned i = 1; i < NumArgs; ++i) { - MachineOperand &Op = MI.getOperand(i); - if (!(Op.isReg() && Op.isImplicit())) - MIB.add(Op); - } - if (MI.hasOneMemOperand()) - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(X86::XMM0); - - MI.eraseFromParent(); - return BB; -} - -// FIXME: Custom handling because TableGen doesn't support multiple implicit -// defs in an instruction pattern -static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, - const TargetInstrInfo *TII) { - unsigned Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; - case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; - case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; - case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; - case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; - case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; - case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; - case 
X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; - } - - DebugLoc dl = MI.getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - - unsigned NumArgs = MI.getNumOperands(); // remove the results - for (unsigned i = 1; i < NumArgs; ++i) { - MachineOperand &Op = MI.getOperand(i); - if (!(Op.isReg() && Op.isImplicit())) - MIB.add(Op); - } - if (MI.hasOneMemOperand()) - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(X86::ECX); - - MI.eraseFromParent(); - return BB; -} - static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); @@ -28167,32 +28107,6 @@ MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } - // String/text processing lowering. - case X86::PCMPISTRM128REG: - case X86::VPCMPISTRM128REG: - case X86::PCMPISTRM128MEM: - case X86::VPCMPISTRM128MEM: - case X86::PCMPESTRM128REG: - case X86::VPCMPESTRM128REG: - case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: - assert(Subtarget.hasSSE42() && - "Target must have SSE4.2 or AVX features enabled"); - return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo()); - - // String/text processing lowering. - case X86::PCMPISTRIREG: - case X86::VPCMPISTRIREG: - case X86::PCMPISTRIMEM: - case X86::VPCMPISTRIMEM: - case X86::PCMPESTRIREG: - case X86::VPCMPESTRIREG: - case X86::PCMPESTRIMEM: - case X86::VPCMPESTRIMEM: - assert(Subtarget.hasSSE42() && - "Target must have SSE4.2 or AVX features enabled"); - return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo()); - // Thread synchronization. 
case X86::MONITOR: return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -555,17 +555,6 @@ def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>; def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>; -def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, - SDTCisVT<4, i8>]>; -def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, v16i8>, SDTCisVT<3, i32>, - SDTCisVT<4, v16i8>, SDTCisVT<5, i32>, - SDTCisVT<6, i8>]>; - -def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; -def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; - def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp @@ -632,9 +632,9 @@ { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 }, { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 }, { X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0 }, - { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, 0 }, + { X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0 }, { X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0 }, - { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, 0 }, + { X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0 }, { X86::PHMINPOSUWrr, X86::PHMINPOSUWrm, TB_ALIGN_16 }, { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE }, { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE }, @@ -736,10 +736,10 @@ { X86::VPABSDrr, X86::VPABSDrm, 0 }, { X86::VPABSWrr, X86::VPABSWrm, 0 }, { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, 
- { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, + { X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 }, { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, - { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, - { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 }, + { X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0 }, + { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 }, { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE }, Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -6383,25 +6383,6 @@ // SSE4.2 - String/text Processing Instructions //===----------------------------------------------------------------------===// -// Packed Compare Implicit Length Strings, Return Mask -multiclass pseudo_pcmpistrm { - def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), - [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, - imm:$src3))]>; - def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), - [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, - (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; -} - -let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, - Requires<[HasAVX]>, VEX_WIG; - defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", loadv2i64>, - Requires<[UseSSE42]>; -} - multiclass pcmpistrm_SS42AI { def rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, u8imm:$src3), @@ -6416,27 +6397,8 @@ let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; - defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; -} - -// Packed Compare Explicit Length Strings, Return Mask -multiclass 
pseudo_pcmpestrm { - def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src3, u8imm:$src5), - [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 - VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; - def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src3, u8imm:$src5), - [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, - (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; -} - -let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>, - Requires<[HasAVX]>; - defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", loadv2i64>, - Requires<[UseSSE42]>; + defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; + defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; } multiclass SS42AI_pcmpestrm { @@ -6453,27 +6415,8 @@ let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; - defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; -} - -// Packed Compare Implicit Length Strings, Return Index -multiclass pseudo_pcmpistri { - def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), - [(set GR32:$dst, EFLAGS, - (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; - def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), - [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, - (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; -} - -let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, - Requires<[HasAVX]>, VEX_WIG; - defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", loadv2i64>, - Requires<[UseSSE42]>; + defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; + defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; } multiclass SS42AI_pcmpistri { @@ -6494,26 +6437,6 @@ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; } 
-// Packed Compare Explicit Length Strings, Return Index -multiclass pseudo_pcmpestri { - def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src3, u8imm:$src5), - [(set GR32:$dst, EFLAGS, - (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; - def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src3, u8imm:$src5), - [(set GR32:$dst, EFLAGS, - (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX, - imm:$src5))]>; -} - -let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in { - defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>, - Requires<[HasAVX]>; - defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", loadv2i64>, - Requires<[UseSSE42]>; -} - multiclass SS42AI_pcmpestri { def rr : SS42AI<0x61, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, u8imm:$src5), Index: llvm/trunk/test/CodeGen/X86/sse42.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse42.ll +++ llvm/trunk/test/CodeGen/X86/sse42.ll @@ -1,964 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X64 - -declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) -declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) -declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8) -declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8) - -define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_reg_eq_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X32-NEXT: setae %al 
-; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_reg_eq_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %esi, %edx -; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_reg_idx_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_reg_idx_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %esi, %edx -; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) - ret i32 %idx -} - -define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_reg_diff_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl 12(%ebp), %edx -; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X32-NEXT: cmpl $16, %ecx -; X32-NEXT: jne .LBB2_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: jmp .LBB2_3 -; X32-NEXT: .LBB2_2: # %compare -; X32-NEXT: movdqa %xmm0, (%esp) -; X32-NEXT: andl $15, %ecx -; X32-NEXT: movb (%esp,%ecx), %al -; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) -; X32-NEXT: subb 16(%esp,%ecx), %al -; X32-NEXT: .LBB2_3: # %exit -; X32-NEXT: movzbl %al, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_reg_diff_i8: -; X64: # %bb.0: # 
%entry -; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %esi, %edx -; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $16, %ecx -; X64-NEXT: jne .LBB2_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB2_2: # %compare -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $15, %ecx -; X64-NEXT: movb -24(%rsp,%rcx), %al -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: subb -40(%rsp,%rcx), %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -entry: - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) - %eq = icmp eq i32 %idx, 16 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <16 x i8> %lhs, i32 %idx - %rhs_c = extractelement <16 x i8> %rhs, i32 %idx - %sub = sub i8 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i8 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i8 %result to i32 - ret i32 %result_ext -} - -define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_mem_eq_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movdqu (%esi), %xmm0 -; X32-NEXT: pcmpestri $24, (%ecx), %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: popl %esi -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_mem_eq_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: pcmpestri $24, (%r8), %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* - %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* - %rhs = load <16 
x i8>, <16 x i8>* %rhs_vptr, align 1 - %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_mem_idx_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movdqu (%esi), %xmm0 -; X32-NEXT: pcmpestri $24, (%ecx), %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: popl %esi -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_mem_idx_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: pcmpestri $24, (%r8), %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* - %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* - %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) - ret i32 %idx -} - -define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_mem_diff_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: pushl %esi -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 12(%ebp), %eax -; X32-NEXT: movl 20(%ebp), %edx -; X32-NEXT: movl 16(%ebp), %ecx -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: movdqu (%esi), %xmm1 -; X32-NEXT: movdqu (%ecx), %xmm0 -; X32-NEXT: pcmpestri $24, %xmm0, %xmm1 -; X32-NEXT: cmpl $16, %ecx -; X32-NEXT: jne .LBB5_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: jmp .LBB5_3 -; 
X32-NEXT: .LBB5_2: # %compare -; X32-NEXT: movdqa %xmm1, (%esp) -; X32-NEXT: andl $15, %ecx -; X32-NEXT: movb (%esp,%ecx), %al -; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: subb 16(%esp,%ecx), %al -; X32-NEXT: .LBB5_3: # %exit -; X32-NEXT: movzbl %al, %eax -; X32-NEXT: leal -4(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_mem_diff_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm1 -; X64-NEXT: movdqu (%rdx), %xmm0 -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: pcmpestri $24, %xmm0, %xmm1 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $16, %ecx -; X64-NEXT: jne .LBB5_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB5_2: # %compare -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $15, %ecx -; X64-NEXT: movb -24(%rsp,%rcx), %al -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: subb -40(%rsp,%rcx), %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* - %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* - %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) - %eq = icmp eq i32 %idx, 16 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <16 x i8> %lhs, i32 %idx - %rhs_c = extractelement <16 x i8> %rhs, i32 %idx - %sub = sub i8 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i8 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i8 %result to i32 - ret i32 %result_ext -} - -define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_reg_eq_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %edx -; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_reg_eq_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %esi, %edx -; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_reg_idx_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_reg_idx_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %esi, %edx -; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) - ret i32 %idx -} - -define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_reg_diff_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl 12(%ebp), %edx -; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X32-NEXT: cmpl $16, %ecx -; X32-NEXT: jne .LBB8_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: jmp .LBB8_3 -; X32-NEXT: .LBB8_2: # %compare -; X32-NEXT: movdqa %xmm0, (%esp) -; X32-NEXT: addl 
%ecx, %ecx -; X32-NEXT: andl $14, %ecx -; X32-NEXT: movzwl (%esp,%ecx), %eax -; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) -; X32-NEXT: subw 16(%esp,%ecx), %ax -; X32-NEXT: .LBB8_3: # %exit -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_reg_diff_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %esi, %edx -; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $16, %ecx -; X64-NEXT: jne .LBB8_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB8_2: # %compare -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $7, %ecx -; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: subw -40(%rsp,%rcx,2), %ax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -entry: - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) - %eq = icmp eq i32 %idx, 16 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <8 x i16> %lhs, i32 %idx - %rhs_c = extractelement <8 x i16> %rhs, i32 %idx - %sub = sub i16 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i16 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i16 %result to i32 - ret i32 %result_ext -} - -define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_mem_eq_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movdqu (%esi), %xmm0 -; X32-NEXT: pcmpestri $25, (%ecx), %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: 
popl %esi -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_mem_eq_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: pcmpestri $25, (%r8), %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* - %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* - %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_mem_idx_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movdqu (%esi), %xmm0 -; X32-NEXT: pcmpestri $25, (%ecx), %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: popl %esi -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_mem_idx_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: pcmpestri $25, (%r8), %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* - %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* - %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, 
i8 25) - ret i32 %idx -} - -define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { -; X32-LABEL: pcmpestri_mem_diff_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: pushl %esi -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 12(%ebp), %eax -; X32-NEXT: movl 20(%ebp), %edx -; X32-NEXT: movl 16(%ebp), %ecx -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: movdqu (%esi), %xmm1 -; X32-NEXT: movdqu (%ecx), %xmm0 -; X32-NEXT: pcmpestri $25, %xmm0, %xmm1 -; X32-NEXT: cmpl $8, %ecx -; X32-NEXT: jne .LBB11_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: jmp .LBB11_3 -; X32-NEXT: .LBB11_2: # %compare -; X32-NEXT: movdqa %xmm1, (%esp) -; X32-NEXT: addl %ecx, %ecx -; X32-NEXT: andl $14, %ecx -; X32-NEXT: movzwl (%esp,%ecx), %eax -; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: subw 16(%esp,%ecx), %ax -; X32-NEXT: .LBB11_3: # %exit -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: leal -4(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: pcmpestri_mem_diff_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm1 -; X64-NEXT: movdqu (%rdx), %xmm0 -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %ecx, %edx -; X64-NEXT: pcmpestri $25, %xmm0, %xmm1 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $8, %ecx -; X64-NEXT: jne .LBB11_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB11_2: # %compare -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $7, %ecx -; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: subw -40(%rsp,%rcx,2), %ax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* - %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* - %rhs = load 
<8 x i16>, <8 x i16>* %rhs_vptr, align 1 - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) - %eq = icmp eq i32 %idx, 8 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <8 x i16> %lhs, i32 %idx - %rhs_c = extractelement <8 x i16> %rhs, i32 %idx - %sub = sub i16 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i16 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i16 %result to i32 - ret i32 %result_ext -} - -define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { -; X32-LABEL: pcmpistri_reg_eq_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_reg_eq_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { -; X32-LABEL: pcmpistri_reg_idx_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_reg_idx_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) - ret i32 %idx -} - -define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { -; X32-LABEL: pcmpistri_reg_diff_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X32-NEXT: cmpl $16, %ecx -; X32-NEXT: jne .LBB14_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: movzbl %al, %eax -; X32-NEXT: retl -; X32-NEXT: .LBB14_2: # %compare 
-; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movdqa %xmm0, (%esp) -; X32-NEXT: andl $15, %ecx -; X32-NEXT: movb (%esp,%ecx), %al -; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) -; X32-NEXT: subb 16(%esp,%ecx), %al -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: movzbl %al, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_reg_diff_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $16, %ecx -; X64-NEXT: jne .LBB14_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB14_2: # %compare -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $15, %ecx -; X64-NEXT: movb -24(%rsp,%rcx), %al -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: subb -40(%rsp,%rcx), %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -entry: - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) - %eq = icmp eq i32 %idx, 16 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <16 x i8> %lhs, i32 %idx - %rhs_c = extractelement <16 x i8> %rhs, i32 %idx - %sub = sub i8 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i8 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i8 %result to i32 - ret i32 %result_ext -} - -define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { -; X32-LABEL: pcmpistri_mem_eq_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movdqu (%ecx), %xmm0 -; X32-NEXT: pcmpistri $24, (%eax), %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_mem_eq_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: pcmpistri $24, (%rsi), %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* - 
%lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* - %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 - %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { -; X32-LABEL: pcmpistri_mem_idx_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movdqu (%ecx), %xmm0 -; X32-NEXT: pcmpistri $24, (%eax), %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_mem_idx_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: pcmpistri $24, (%rsi), %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* - %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* - %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) - ret i32 %idx -} - -define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { -; X32-LABEL: pcmpistri_mem_diff_i8: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 12(%ebp), %eax -; X32-NEXT: movl 8(%ebp), %ecx -; X32-NEXT: movdqu (%ecx), %xmm1 -; X32-NEXT: movdqu (%eax), %xmm0 -; X32-NEXT: pcmpistri $24, %xmm0, %xmm1 -; X32-NEXT: cmpl $16, %ecx -; X32-NEXT: jne .LBB17_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: jmp .LBB17_3 -; X32-NEXT: .LBB17_2: # %compare -; X32-NEXT: movdqa %xmm1, (%esp) -; X32-NEXT: andl $15, %ecx -; X32-NEXT: movb (%esp,%ecx), %al -; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: subb 16(%esp,%ecx), %al -; X32-NEXT: .LBB17_3: # %exit -; X32-NEXT: movzbl %al, %eax -; X32-NEXT: movl %ebp, %esp 
-; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_mem_diff_i8: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm1 -; X64-NEXT: movdqu (%rsi), %xmm0 -; X64-NEXT: pcmpistri $24, %xmm0, %xmm1 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $16, %ecx -; X64-NEXT: jne .LBB17_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB17_2: # %compare -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $15, %ecx -; X64-NEXT: movb -24(%rsp,%rcx), %al -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: subb -40(%rsp,%rcx), %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* - %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* - %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) - %eq = icmp eq i32 %idx, 16 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <16 x i8> %lhs, i32 %idx - %rhs_c = extractelement <16 x i8> %rhs, i32 %idx - %sub = sub i8 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i8 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i8 %result to i32 - ret i32 %result_ext -} - -define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { -; X32-LABEL: pcmpistri_reg_eq_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_reg_eq_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 
@pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { -; X32-LABEL: pcmpistri_reg_idx_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_reg_idx_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) - ret i32 %idx -} - -define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { -; X32-LABEL: pcmpistri_reg_diff_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X32-NEXT: cmpl $16, %ecx -; X32-NEXT: jne .LBB20_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: retl -; X32-NEXT: .LBB20_2: # %compare -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movdqa %xmm0, (%esp) -; X32-NEXT: addl %ecx, %ecx -; X32-NEXT: andl $14, %ecx -; X32-NEXT: movzwl (%esp,%ecx), %eax -; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) -; X32-NEXT: subw 16(%esp,%ecx), %ax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_reg_diff_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $16, %ecx -; X64-NEXT: jne .LBB20_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB20_2: # %compare -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $7, %ecx -; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: subw -40(%rsp,%rcx,2), %ax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -entry: - %lhs_cast 
= bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) - %eq = icmp eq i32 %idx, 16 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <8 x i16> %lhs, i32 %idx - %rhs_c = extractelement <8 x i16> %rhs, i32 %idx - %sub = sub i16 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i16 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i16 %result to i32 - ret i32 %result_ext -} - -define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { -; X32-LABEL: pcmpistri_mem_eq_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movdqu (%ecx), %xmm0 -; X32-NEXT: pcmpistri $25, (%eax), %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_mem_eq_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: pcmpistri $25, (%rsi), %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* - %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* - %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) - %result = icmp eq i32 %c, 0 - ret i1 %result -} - -define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { -; X32-LABEL: pcmpistri_mem_idx_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movdqu (%ecx), %xmm0 -; X32-NEXT: pcmpistri $25, (%eax), %xmm0 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_mem_idx_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: pcmpistri 
$25, (%rsi), %xmm0 -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* - %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* - %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) - ret i32 %idx -} - -define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { -; X32-LABEL: pcmpistri_mem_diff_i16: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 12(%ebp), %eax -; X32-NEXT: movl 8(%ebp), %ecx -; X32-NEXT: movdqu (%ecx), %xmm1 -; X32-NEXT: movdqu (%eax), %xmm0 -; X32-NEXT: pcmpistri $25, %xmm0, %xmm1 -; X32-NEXT: cmpl $8, %ecx -; X32-NEXT: jne .LBB23_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: jmp .LBB23_3 -; X32-NEXT: .LBB23_2: # %compare -; X32-NEXT: movdqa %xmm1, (%esp) -; X32-NEXT: addl %ecx, %ecx -; X32-NEXT: andl $14, %ecx -; X32-NEXT: movzwl (%esp,%ecx), %eax -; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: subw 16(%esp,%ecx), %ax -; X32-NEXT: .LBB23_3: # %exit -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: pcmpistri_mem_diff_i16: -; X64: # %bb.0: # %entry -; X64-NEXT: movdqu (%rdi), %xmm1 -; X64-NEXT: movdqu (%rsi), %xmm0 -; X64-NEXT: pcmpistri $25, %xmm0, %xmm1 -; X64-NEXT: # kill: def $ecx killed $ecx def $rcx -; X64-NEXT: cmpl $8, %ecx -; X64-NEXT: jne .LBB23_2 -; X64-NEXT: # %bb.1: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB23_2: # %compare -; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $7, %ecx -; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax -; X64-NEXT: movdqa %xmm0, 
-{{[0-9]+}}(%rsp) -; X64-NEXT: subw -40(%rsp,%rcx,2), %ax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq -entry: - %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* - %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 - %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* - %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 - %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> - %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> - %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) - %eq = icmp eq i32 %idx, 8 - br i1 %eq, label %exit, label %compare - -compare: - %lhs_c = extractelement <8 x i16> %lhs, i32 %idx - %rhs_c = extractelement <8 x i16> %rhs, i32 %idx - %sub = sub i16 %lhs_c, %rhs_c - br label %exit - -exit: - %result = phi i16 [ 0, %entry ], [ %sub, %compare ] - %result_ext = zext i16 %result to i32 - ret i32 %result_ext -} Index: llvm/trunk/test/CodeGen/X86/sttni.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sttni.ll +++ llvm/trunk/test/CodeGen/X86/sttni.ll @@ -0,0 +1,1337 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X64 + +declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) +declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) +declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) +declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8) +declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8) +declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8) + +define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: 
pcmpestri_reg_eq_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_reg_eq_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: retq +entry: + %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_idx_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_reg_idx_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + ret i32 %idx +} + +define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_diff_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl 12(%ebp), %edx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: cmpl $16, %ecx +; X32-NEXT: jne .LBB2_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: jmp .LBB2_3 +; X32-NEXT: .LBB2_2: # %compare +; X32-NEXT: movdqa %xmm0, (%esp) +; X32-NEXT: andl $15, %ecx +; X32-NEXT: movb (%esp,%ecx), %al +; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: subb 16(%esp,%ecx), 
%al +; X32-NEXT: .LBB2_3: # %exit +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_reg_diff_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $16, %ecx +; X64-NEXT: jne .LBB2_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB2_2: # %compare +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $15, %ecx +; X64-NEXT: movb -24(%rsp,%rcx), %al +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: subb -40(%rsp,%rcx), %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %eq = icmp eq i32 %idx, 16 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <16 x i8> %lhs, i32 %idx + %rhs_c = extractelement <16 x i8> %rhs, i32 %idx + %sub = sub i8 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i8 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i8 %result to i32 + ret i32 %result_ext +} + +define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_eq_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movdqu (%esi), %xmm0 +; X32-NEXT: pcmpestri $24, (%ecx), %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: popl %esi +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_mem_eq_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: pcmpestri $24, (%r8), %xmm0 +; X64-NEXT: setae %al +; 
X64-NEXT: retq +entry: + %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* + %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* + %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 + %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_idx_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movdqu (%esi), %xmm0 +; X32-NEXT: pcmpestri $24, (%ecx), %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: popl %esi +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_mem_idx_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: pcmpestri $24, (%r8), %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* + %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* + %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + ret i32 %idx +} + +define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_diff_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: pushl %esi +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: movl 20(%ebp), %edx +; X32-NEXT: movl 16(%ebp), %ecx +; X32-NEXT: movl 8(%ebp), %esi +; X32-NEXT: movdqu (%esi), %xmm1 +; X32-NEXT: movdqu 
(%ecx), %xmm0 +; X32-NEXT: pcmpestri $24, %xmm0, %xmm1 +; X32-NEXT: cmpl $16, %ecx +; X32-NEXT: jne .LBB5_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: jmp .LBB5_3 +; X32-NEXT: .LBB5_2: # %compare +; X32-NEXT: movdqa %xmm1, (%esp) +; X32-NEXT: andl $15, %ecx +; X32-NEXT: movb (%esp,%ecx), %al +; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: subb 16(%esp,%ecx), %al +; X32-NEXT: .LBB5_3: # %exit +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_mem_diff_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqu (%rdi), %xmm1 +; X64-NEXT: movdqu (%rdx), %xmm0 +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: pcmpestri $24, %xmm0, %xmm1 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $16, %ecx +; X64-NEXT: jne .LBB5_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB5_2: # %compare +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $15, %ecx +; X64-NEXT: movb -24(%rsp,%rcx), %al +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: subb -40(%rsp,%rcx), %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* + %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* + %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %eq = icmp eq i32 %idx, 16 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <16 x i8> %lhs, i32 %idx + %rhs_c = extractelement <16 x i8> %rhs, i32 %idx + %sub = sub i8 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i8 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i8 %result to i32 + ret i32 %result_ext +} + +define i1 @pcmpestri_reg_eq_i16(<8 x i16> 
%lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_eq_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_reg_eq_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: retq +entry: + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_idx_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_reg_idx_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) + ret i32 %idx +} + +define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_diff_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl 12(%ebp), %edx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: cmpl $16, %ecx +; 
X32-NEXT: jne .LBB8_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: jmp .LBB8_3 +; X32-NEXT: .LBB8_2: # %compare +; X32-NEXT: movdqa %xmm0, (%esp) +; X32-NEXT: addl %ecx, %ecx +; X32-NEXT: andl $14, %ecx +; X32-NEXT: movzwl (%esp,%ecx), %eax +; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: subw 16(%esp,%ecx), %ax +; X32-NEXT: .LBB8_3: # %exit +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_reg_diff_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $16, %ecx +; X64-NEXT: jne .LBB8_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB8_2: # %compare +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $7, %ecx +; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: subw -40(%rsp,%rcx,2), %ax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +entry: + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) + %eq = icmp eq i32 %idx, 16 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <8 x i16> %lhs, i32 %idx + %rhs_c = extractelement <8 x i16> %rhs, i32 %idx + %sub = sub i16 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i16 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i16 %result to i32 + ret i32 %result_ext +} + +define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_eq_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: 
movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movdqu (%esi), %xmm0 +; X32-NEXT: pcmpestri $25, (%ecx), %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: popl %esi +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_mem_eq_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: pcmpestri $25, (%r8), %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* + %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* + %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_idx_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movdqu (%esi), %xmm0 +; X32-NEXT: pcmpestri $25, (%ecx), %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: popl %esi +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_mem_idx_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: pcmpestri $25, (%r8), %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* + %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* + %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 + %lhs_cast = bitcast <8 x i16> %lhs 
to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) + ret i32 %idx +} + +define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_diff_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: pushl %esi +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: movl 20(%ebp), %edx +; X32-NEXT: movl 16(%ebp), %ecx +; X32-NEXT: movl 8(%ebp), %esi +; X32-NEXT: movdqu (%esi), %xmm1 +; X32-NEXT: movdqu (%ecx), %xmm0 +; X32-NEXT: pcmpestri $25, %xmm0, %xmm1 +; X32-NEXT: cmpl $8, %ecx +; X32-NEXT: jne .LBB11_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: jmp .LBB11_3 +; X32-NEXT: .LBB11_2: # %compare +; X32-NEXT: movdqa %xmm1, (%esp) +; X32-NEXT: addl %ecx, %ecx +; X32-NEXT: andl $14, %ecx +; X32-NEXT: movzwl (%esp,%ecx), %eax +; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: subw 16(%esp,%ecx), %ax +; X32-NEXT: .LBB11_3: # %exit +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_mem_diff_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqu (%rdi), %xmm1 +; X64-NEXT: movdqu (%rdx), %xmm0 +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: pcmpestri $25, %xmm0, %xmm1 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $8, %ecx +; X64-NEXT: jne .LBB11_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB11_2: # %compare +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $7, %ecx +; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: subw -40(%rsp,%rcx,2), %ax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: 
retq +entry: + %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* + %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* + %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) + %eq = icmp eq i32 %idx, 8 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <8 x i16> %lhs, i32 %idx + %rhs_c = extractelement <8 x i16> %rhs, i32 %idx + %sub = sub i16 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i16 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i16 %result to i32 + ret i32 %result_ext +} + +define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_eq_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_reg_eq_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: retq +entry: + %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_idx_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_reg_idx_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + ret i32 %idx +} + +define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_diff_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; 
X32-NEXT: cmpl $16, %ecx +; X32-NEXT: jne .LBB14_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; X32-NEXT: .LBB14_2: # %compare +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movdqa %xmm0, (%esp) +; X32-NEXT: andl $15, %ecx +; X32-NEXT: movb (%esp,%ecx), %al +; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: subb 16(%esp,%ecx), %al +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_reg_diff_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $16, %ecx +; X64-NEXT: jne .LBB14_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB14_2: # %compare +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $15, %ecx +; X64-NEXT: movb -24(%rsp,%rcx), %al +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: subb -40(%rsp,%rcx), %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %eq = icmp eq i32 %idx, 16 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <16 x i8> %lhs, i32 %idx + %rhs_c = extractelement <16 x i8> %rhs, i32 %idx + %sub = sub i8 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i8 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i8 %result to i32 + ret i32 %result_ext +} + +define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_eq_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm0 +; X32-NEXT: pcmpistri $24, (%eax), %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_mem_eq_i8: +; X64: # %bb.0: 
# %entry +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: pcmpistri $24, (%rsi), %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* + %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* + %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 + %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_idx_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm0 +; X32-NEXT: pcmpistri $24, (%eax), %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_mem_idx_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: pcmpistri $24, (%rsi), %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* + %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* + %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + ret i32 %idx +} + +define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_diff_i8: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: movl 8(%ebp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm1 +; X32-NEXT: movdqu (%eax), %xmm0 +; X32-NEXT: pcmpistri $24, %xmm0, %xmm1 +; X32-NEXT: cmpl $16, %ecx +; X32-NEXT: jne .LBB17_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: jmp .LBB17_3 +; X32-NEXT: .LBB17_2: # %compare +; X32-NEXT: movdqa %xmm1, (%esp) +; X32-NEXT: andl $15, %ecx +; X32-NEXT: movb 
(%esp,%ecx), %al +; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: subb 16(%esp,%ecx), %al +; X32-NEXT: .LBB17_3: # %exit +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_mem_diff_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqu (%rdi), %xmm1 +; X64-NEXT: movdqu (%rsi), %xmm0 +; X64-NEXT: pcmpistri $24, %xmm0, %xmm1 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $16, %ecx +; X64-NEXT: jne .LBB17_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB17_2: # %compare +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $15, %ecx +; X64-NEXT: movb -24(%rsp,%rcx), %al +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: subb -40(%rsp,%rcx), %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* + %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* + %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %eq = icmp eq i32 %idx, 16 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <16 x i8> %lhs, i32 %idx + %rhs_c = extractelement <16 x i8> %rhs, i32 %idx + %sub = sub i8 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i8 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i8 %result to i32 + ret i32 %result_ext +} + +define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_eq_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_reg_eq_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: retq +entry: + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x 
i16> %rhs to <16 x i8> + %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_idx_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_reg_idx_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) + ret i32 %idx +} + +define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_diff_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: cmpl $16, %ecx +; X32-NEXT: jne .LBB20_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: retl +; X32-NEXT: .LBB20_2: # %compare +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movdqa %xmm0, (%esp) +; X32-NEXT: addl %ecx, %ecx +; X32-NEXT: andl $14, %ecx +; X32-NEXT: movzwl (%esp,%ecx), %eax +; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: subw 16(%esp,%ecx), %ax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_reg_diff_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $16, %ecx +; X64-NEXT: jne .LBB20_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB20_2: # %compare +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $7, %ecx +; X64-NEXT: 
movzwl -24(%rsp,%rcx,2), %eax +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: subw -40(%rsp,%rcx,2), %ax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +entry: + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) + %eq = icmp eq i32 %idx, 16 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <8 x i16> %lhs, i32 %idx + %rhs_c = extractelement <8 x i16> %rhs, i32 %idx + %sub = sub i16 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i16 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i16 %result to i32 + ret i32 %result_ext +} + +define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_eq_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm0 +; X32-NEXT: pcmpistri $25, (%eax), %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_mem_eq_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: pcmpistri $25, (%rsi), %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* + %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* + %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) + %result = icmp eq i32 %c, 0 + ret i1 %result +} + +define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_idx_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm0 +; X32-NEXT: pcmpistri $25, 
(%eax), %xmm0 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_mem_idx_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqu (%rdi), %xmm0 +; X64-NEXT: pcmpistri $25, (%rsi), %xmm0 +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* + %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* + %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) + ret i32 %idx +} + +define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_diff_i16: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $48, %esp +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: movl 8(%ebp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm1 +; X32-NEXT: movdqu (%eax), %xmm0 +; X32-NEXT: pcmpistri $25, %xmm0, %xmm1 +; X32-NEXT: cmpl $8, %ecx +; X32-NEXT: jne .LBB23_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: jmp .LBB23_3 +; X32-NEXT: .LBB23_2: # %compare +; X32-NEXT: movdqa %xmm1, (%esp) +; X32-NEXT: addl %ecx, %ecx +; X32-NEXT: andl $14, %ecx +; X32-NEXT: movzwl (%esp,%ecx), %eax +; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: subw 16(%esp,%ecx), %ax +; X32-NEXT: .LBB23_3: # %exit +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: pcmpistri_mem_diff_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqu (%rdi), %xmm1 +; X64-NEXT: movdqu (%rsi), %xmm0 +; X64-NEXT: pcmpistri $25, %xmm0, %xmm1 +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: cmpl $8, %ecx +; X64-NEXT: jne .LBB23_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; 
X64-NEXT: .LBB23_2: # %compare +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl $7, %ecx +; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: subw -40(%rsp,%rcx,2), %ax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +entry: + %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* + %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 + %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* + %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 + %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> + %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> + %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) + %eq = icmp eq i32 %idx, 8 + br i1 %eq, label %exit, label %compare + +compare: + %lhs_c = extractelement <8 x i16> %lhs, i32 %idx + %rhs_c = extractelement <8 x i16> %rhs, i32 %idx + %sub = sub i16 %lhs_c, %rhs_c + br label %exit + +exit: + %result = phi i16 [ 0, %entry ], [ %sub, %compare ] + %result_ext = zext i16 %result to i32 + ret i32 %result_ext +} + +define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: setb %bl +; X32-NEXT: movl %ecx, (%edi) +; X32-NEXT: movl %ebx, (%esi) +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpestr_index_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: xorl %r10d, %r10d +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: setb %r10b +; 
X64-NEXT: movl %ecx, (%r9) +; X64-NEXT: movl %r10d, (%r8) +; X64-NEXT: retq +entry: + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_mask_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%esi) +; X32-NEXT: movl %ebx, (%ecx) +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpestr_mask_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: xorl %r9d, %r9d +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT: setb %r9b +; X64-NEXT: movdqa %xmm0, (%r8) +; X64-NEXT: movl %r9d, (%rcx) +; X64-NEXT: retq +entry: + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind { +; X32-LABEL: pcmpestr_mask_index: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X32-NEXT: movdqa %xmm0, (%edi) +; X32-NEXT: movl %ecx, (%esi) +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: retl +; +; X64-LABEL: pcmpestr_mask_index: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm0, (%r9) +; X64-NEXT: movl %ecx, (%r8) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + ret void +} + +define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_mask_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%ebp) +; X32-NEXT: movl %ecx, (%edi) +; X32-NEXT: movl %ebx, (%esi) +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: pcmpestr_mask_index_flag: +; X64: # %bb.0: # %entry 
+; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X64-NEXT: setb %sil +; X64-NEXT: movdqa %xmm0, (%r10) +; X64-NEXT: movl %ecx, (%r9) +; X64-NEXT: movl %esi, (%r8) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: setb %dl +; X32-NEXT: movl %ecx, (%esi) +; X32-NEXT: movl %edx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_index_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: setb %al +; X64-NEXT: movl %ecx, (%rdi) +; X64-NEXT: movl %eax, (%rsi) +; X64-NEXT: retq +entry: + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_flag: +; X32: # %bb.0: # %entry 
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: setb %cl +; X32-NEXT: movdqa %xmm0, (%edx) +; X32-NEXT: movl %ecx, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT: setb %al +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movl %eax, (%rsi) +; X64-NEXT: retq +entry: + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind { +; X32-LABEL: pcmpistr_mask_index: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movdqa %xmm0, (%edx) +; X32-NEXT: movl %ecx, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_index: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + ret void +} + +define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpistri $24, %xmm1, %xmm2 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%esi) +; X32-NEXT: movl %ecx, (%edx) +; X32-NEXT: movl %ebx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_index_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: pcmpistri $24, %xmm1, %xmm2 +; X64-NEXT: setb %al +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: movl %eax, (%rdx) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri. 
+define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_index_flag_load: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm1 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm2 +; X32-NEXT: pcmpistrm $24, %xmm2, %xmm0 +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpistri $24, %xmm2, %xmm1 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%esi) +; X32-NEXT: movl %ecx, (%edx) +; X32-NEXT: movl %ebx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_index_flag_load: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: movdqu (%rdi), %xmm2 +; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0 +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: pcmpistri $24, %xmm2, %xmm1 +; X64-NEXT: setb %dil +; X64-NEXT: movdqa %xmm0, (%rsi) +; X64-NEXT: movl %ecx, (%rdx) +; X64-NEXT: movl %edi, (%rax) +; X64-NEXT: retq +entry: + %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1 + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +; Make sure we don't fold nontemporal loads. 
+define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_nontemporal: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movntdqa (%ecx), %xmm1 +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_nontemporal: +; X64: # %bb.0: # %entry +; X64-NEXT: movntdqa (%rsi), %xmm1 +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: setb %sil +; X64-NEXT: movl %esi, %eax +; X64-NEXT: retq +entry: + %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0 + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + ret i32 %flag +} + +!0 = !{ i32 1 }