Index: llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -21,6 +21,92 @@
 using namespace llvm;
 
+/// \brief Extracts the src/dst types for a given zero extension instruction.
+/// \note While the number of elements in the DstVT type is correct, the
+/// number in the SrcVT type is expanded to fill the src xmm register, and the
+/// upper elements may not be included in the dst xmm/ymm register.
+static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown zero extension instruction");
+  // i8 zero extension
+  case X86::PMOVZXBWrm:
+  case X86::PMOVZXBWrr:
+  case X86::VPMOVZXBWrm:
+  case X86::VPMOVZXBWrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v8i16;
+    break;
+  case X86::VPMOVZXBWYrm:
+  case X86::VPMOVZXBWYrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v16i16;
+    break;
+  case X86::PMOVZXBDrm:
+  case X86::PMOVZXBDrr:
+  case X86::VPMOVZXBDrm:
+  case X86::VPMOVZXBDrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v4i32;
+    break;
+  case X86::VPMOVZXBDYrm:
+  case X86::VPMOVZXBDYrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v8i32;
+    break;
+  case X86::PMOVZXBQrm:
+  case X86::PMOVZXBQrr:
+  case X86::VPMOVZXBQrm:
+  case X86::VPMOVZXBQrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v2i64;
+    break;
+  case X86::VPMOVZXBQYrm:
+  case X86::VPMOVZXBQYrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v4i64;
+    break;
+  // i16 zero extension
+  case X86::PMOVZXWDrm:
+  case X86::PMOVZXWDrr:
+  case X86::VPMOVZXWDrm:
+  case X86::VPMOVZXWDrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v4i32;
+    break;
+  case X86::VPMOVZXWDYrm:
+  case X86::VPMOVZXWDYrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v8i32;
+    break;
+  case X86::PMOVZXWQrm:
+  case X86::PMOVZXWQrr:
+  case X86::VPMOVZXWQrm:
+  case X86::VPMOVZXWQrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v2i64;
+    break;
+  case X86::VPMOVZXWQYrm:
+  case X86::VPMOVZXWQYrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v4i64;
+    break;
+  // i32 zero extension
+  case X86::PMOVZXDQrm:
+  case X86::PMOVZXDQrr:
+  case X86::VPMOVZXDQrm:
+  case X86::VPMOVZXDQrr:
+    SrcVT = MVT::v4i32;
+    DstVT = MVT::v2i64;
+    break;
+  case X86::VPMOVZXDQYrm:
+  case X86::VPMOVZXDQYrr:
+    SrcVT = MVT::v4i32;
+    DstVT = MVT::v4i64;
+    break;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Top Level Entrypoint
 //===----------------------------------------------------------------------===//
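For illustration only (not part of the patch): getZeroExtensionTypes above maps each PMOVZX* opcode to a (SrcVT, DstVT) pair, and DecodeZeroExtendMask (added to X86ShuffleDecode further down) expands that pair into a shuffle mask with one source index followed by Scale - 1 zero sentinels per destination element. A minimal standalone C++ sketch of that expansion, under those assumptions:

// Minimal standalone sketch (illustrative only, not the LLVM API);
// Zero stands in for LLVM's SM_SentinelZero.
#include <cstdio>
#include <vector>

static const int Zero = -2;

static std::vector<int> zeroExtendMask(unsigned NumDstElts, unsigned Scale) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back(i);        // low part of each dst element comes from src
    for (unsigned j = 1; j != Scale; ++j)
      Mask.push_back(Zero);   // the widened upper parts are zeroed
  }
  return Mask;
}

int main() {
  // PMOVZXBD: SrcVT = v16i8, DstVT = v4i32, so Scale = 32 / 8 = 4.
  // Prints "0 Z Z Z 1 Z Z Z 2 Z Z Z 3 Z Z Z", matching the
  // "xmm0[0],zero,zero,zero,xmm0[1],..." comments in the pmovzxbd tests below.
  for (int M : zeroExtendMask(4, 4)) {
    if (M == Zero)
      std::printf("Z ");
    else
      std::printf("%d ", M);
  }
  std::printf("\n");
  return 0;
}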
@@ -750,6 +836,92 @@
                        ShuffleMask);
     DestName = getRegName(MI->getOperand(0).getReg());
     break;
+
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+    DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+    DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVPQI2QIrr:
+  case X86::MOVZPQILo2PQIrr:
+  case X86::VMOVPQI2QIrr:
+  case X86::VMOVZPQILo2PQIrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVQI2PQIrm:
+  case X86::MOVZQI2PQIrm:
+  case X86::MOVZPQILo2PQIrm:
+  case X86::VMOVQI2PQIrm:
+  case X86::VMOVZQI2PQIrm:
+  case X86::VMOVZPQILo2PQIrm:
+    DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::MOVDI2PDIrm:
+  case X86::VMOVDI2PDIrm:
+    DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::PMOVZXBWrr:
+  case X86::PMOVZXBDrr:
+  case X86::PMOVZXBQrr:
+  case X86::PMOVZXWDrr:
+  case X86::PMOVZXWQrr:
+  case X86::PMOVZXDQrr:
+  case X86::VPMOVZXBWrr:
+  case X86::VPMOVZXBDrr:
+  case X86::VPMOVZXBQrr:
+  case X86::VPMOVZXWDrr:
+  case X86::VPMOVZXWQrr:
+  case X86::VPMOVZXDQrr:
+  case X86::VPMOVZXBWYrr:
+  case X86::VPMOVZXBDYrr:
+  case X86::VPMOVZXBQYrr:
+  case X86::VPMOVZXWDYrr:
+  case X86::VPMOVZXWQYrr:
+  case X86::VPMOVZXDQYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PMOVZXBWrm:
+  case X86::PMOVZXBDrm:
+  case X86::PMOVZXBQrm:
+  case X86::PMOVZXWDrm:
+  case X86::PMOVZXWQrm:
+  case X86::PMOVZXDQrm:
+  case X86::VPMOVZXBWrm:
+  case X86::VPMOVZXBDrm:
+  case X86::VPMOVZXBQrm:
+  case X86::VPMOVZXWDrm:
+  case X86::VPMOVZXWQrm:
+  case X86::VPMOVZXDQrm:
+  case X86::VPMOVZXBWYrm:
+  case X86::VPMOVZXBDYrm:
+  case X86::VPMOVZXBQYrm:
+  case X86::VPMOVZXWDYrm:
+  case X86::VPMOVZXWQYrm:
+  case X86::VPMOVZXDQYrm: {
+    MVT SrcVT, DstVT;
+    getZeroExtensionTypes(MI, SrcVT, DstVT);
+    DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+  } break;
   }
 
   // The only comments we decode are shuffles, so give up if we were unable to
Index: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
===================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -87,9 +87,19 @@
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
-/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
-
-} // llvm namespace
-
-#endif
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a move lower and zero upper instruction as a shuffle mask.
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a scalar float move instruction as a shuffle mask.
+void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+                          SmallVectorImpl<int> &ShuffleMask);
+} // llvm namespace
+
+#endif
Index: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -1,81 +1,81 @@
-//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-// -//===----------------------------------------------------------------------===// -// -// Define several functions to decode x86 specific shuffle semantics into a -// generic vector mask. -// -//===----------------------------------------------------------------------===// - -#include "X86ShuffleDecode.h" -#include "llvm/IR/Constants.h" -#include "llvm/CodeGen/MachineValueType.h" - -//===----------------------------------------------------------------------===// -// Vector Mask Decoding -//===----------------------------------------------------------------------===// - -namespace llvm { - -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl &ShuffleMask) { - // Defaults the copying the dest value. - ShuffleMask.push_back(0); - ShuffleMask.push_back(1); - ShuffleMask.push_back(2); - ShuffleMask.push_back(3); - - // Decode the immediate. - unsigned ZMask = Imm & 15; - unsigned CountD = (Imm >> 4) & 3; - unsigned CountS = (Imm >> 6) & 3; - - // CountS selects which input element to use. - unsigned InVal = 4+CountS; - // CountD specifies which element of destination to update. - ShuffleMask[CountD] = InVal; - // ZMask zaps values, potentially overriding the CountD elt. - if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; - if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; - if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; - if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; -} - -// <3,1> or <6,7,2,3> -void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl &ShuffleMask) { - for (unsigned i = NElts/2; i != NElts; ++i) - ShuffleMask.push_back(NElts+i); - - for (unsigned i = NElts/2; i != NElts; ++i) - ShuffleMask.push_back(i); -} - -// <0,2> or <0,1,4,5> -void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) - ShuffleMask.push_back(i); - - for (unsigned i = 0; i != NElts/2; ++i) - ShuffleMask.push_back(NElts+i); -} - -void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - for (int i = 0, e = NumElts / 2; i < e; ++i) { - ShuffleMask.push_back(2 * i); - ShuffleMask.push_back(2 * i); - } -} - -void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - for (int i = 0, e = NumElts / 2; i < e; ++i) { - ShuffleMask.push_back(2 * i + 1); - ShuffleMask.push_back(2 * i + 1); +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. +// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecode.h" +#include "llvm/IR/Constants.h" +#include "llvm/CodeGen/MachineValueType.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl &ShuffleMask) { + // Defaults the copying the dest value. + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + + // Decode the immediate. 
+ unsigned ZMask = Imm & 15; + unsigned CountD = (Imm >> 4) & 3; + unsigned CountS = (Imm >> 6) & 3; + + // CountS selects which input element to use. + unsigned InVal = 4+CountS; + // CountD specifies which element of destination to update. + ShuffleMask[CountD] = InVal; + // ZMask zaps values, potentially overriding the CountD elt. + if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl &ShuffleMask) { + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(NElts+i); + + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(NElts+i); +} + +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i); + ShuffleMask.push_back(2 * i); + } +} + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i + 1); + ShuffleMask.push_back(2 * i + 1); } } @@ -96,307 +96,339 @@ void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { unsigned VectorSizeInBits = VT.getSizeInBits(); unsigned NumElts = VectorSizeInBits / 8; - unsigned NumLanes = VectorSizeInBits / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l < NumElts; l += NumLaneElts) - for (unsigned i = 0; i < NumLaneElts; ++i) { - int M = SM_SentinelZero; - if (i >= Imm) M = i - Imm + l; - ShuffleMask.push_back(M); - } -} - -void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { - unsigned VectorSizeInBits = VT.getSizeInBits(); - unsigned NumElts = VectorSizeInBits / 8; - unsigned NumLanes = VectorSizeInBits / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l < NumElts; l += NumLaneElts) - for (unsigned i = 0; i < NumLaneElts; ++i) { - unsigned Base = i + Imm; - int M = Base + l; - if (Base >= NumLaneElts) M = SM_SentinelZero; - ShuffleMask.push_back(M); - } -} - -void DecodePALIGNRMask(MVT VT, unsigned Imm, - SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); - - unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0; i != NumLaneElts; ++i) { - unsigned Base = i + Offset; - // if i+offset is out of this lane then we actually need the other source - if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; - ShuffleMask.push_back(Base + l); - } - } -} - -/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. 
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - unsigned NewImm = Imm; - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0; i != NumLaneElts; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + l); - NewImm /= NumLaneElts; - } - if (NumLaneElts == 4) NewImm = Imm; // reload imm - } -} - -void DecodePSHUFHWMask(MVT VT, unsigned Imm, - SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - for (unsigned l = 0; l != NumElts; l += 8) { - unsigned NewImm = Imm; - for (unsigned i = 0, e = 4; i != e; ++i) { - ShuffleMask.push_back(l + i); - } - for (unsigned i = 4, e = 8; i != e; ++i) { - ShuffleMask.push_back(l + 4 + (NewImm & 3)); - NewImm >>= 2; - } - } -} - -void DecodePSHUFLWMask(MVT VT, unsigned Imm, - SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - for (unsigned l = 0; l != NumElts; l += 8) { - unsigned NewImm = Imm; - for (unsigned i = 0, e = 4; i != e; ++i) { - ShuffleMask.push_back(l + (NewImm & 3)); - NewImm >>= 2; - } - for (unsigned i = 4, e = 8; i != e; ++i) { - ShuffleMask.push_back(l + i); - } - } -} - -/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates -/// the type of the vector allowing it to handle different datatypes and vector -/// widths. -void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - unsigned NewImm = Imm; - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - // each half of a lane comes from different source - for (unsigned s = 0; s != NumElts*2; s += NumElts) { - for (unsigned i = 0; i != NumLaneElts/2; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + s + l); - NewImm /= NumLaneElts; - } - } - if (NumLaneElts == 4) NewImm = Imm; // reload imm - } -} - -/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd -/// and punpckh*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. -void DecodeUNPCKHMask(MVT VT, SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumElts); // Reads from src/src2 - } - } -} - -/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// and punpckl*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. -void DecodeUNPCKLMask(MVT VT, SmallVectorImpl &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. 
- unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumElts); // Reads from src/src2 - } - } -} - -void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, - SmallVectorImpl &ShuffleMask) { - if (Imm & 0x88) - return; // Not a shuffle - - unsigned HalfSize = VT.getVectorNumElements()/2; - - for (unsigned l = 0; l != 2; ++l) { - unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize; - for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i) - ShuffleMask.push_back(i); - } -} - -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl &ShuffleMask) { - Type *MaskTy = C->getType(); - // It is not an error for the PSHUFB mask to not be a vector of i8 because the - // constant pool uniques constants by their bit representation. - // e.g. the following take up the same space in the constant pool: - // i128 -170141183420855150465331762880109871104 - // - // <2 x i64> - // - // <4 x i32> - - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. - return; - - // This is a straightforward byte vector. - if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { - int NumElements = MaskTy->getVectorNumElements(); - ShuffleMask.reserve(NumElements); - - for (int i = 0; i < NumElements; ++i) { - // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte - // lane of the vector we're inside. - int Base = i < 16 ? 0 : 16; - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast(COp)->getZExtValue(); - // If the high bit (7) of the byte is set, the element is zeroed. - if (Element & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (Element & 0xf); - ShuffleMask.push_back(Index); - } - } - } - // TODO: Handle funny-looking vectors too. -} - -void DecodePSHUFBMask(ArrayRef RawMask, - SmallVectorImpl &ShuffleMask) { - for (int i = 0, e = RawMask.size(); i < e; ++i) { - uint64_t M = RawMask[i]; - if (M == (uint64_t)SM_SentinelUndef) { - ShuffleMask.push_back(M); - continue; - } - // For AVX vectors with 32 bytes the base of the shuffle is the half of - // the vector we're inside. - int Base = i < 16 ? 0 : 16; - // If the high bit (7) of the byte is set, the element is zeroed. - if (M & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (M & 0xf); - ShuffleMask.push_back(Index); - } - } -} - -void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { - int ElementBits = VT.getScalarSizeInBits(); - int NumElements = VT.getVectorNumElements(); - for (int i = 0; i < NumElements; ++i) { - // If there are more than 8 elements in the vector, then any immediate blend - // mask applies to each 128-bit lane. There can never be more than - // 8 elements in a 128-bit lane with an immediate blend. - int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; - assert(Bit < 8 && - "Immediate blends only operate over 8 elements at a time!"); - ShuffleMask.push_back(((Imm >> Bit) & 1) ? 
NumElements + i : i); - } -} - -/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. -/// No VT provided since it only works on 256-bit, 4 element vectors. -void DecodeVPERMMask(unsigned Imm, SmallVectorImpl &ShuffleMask) { - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back((Imm >> (2*i)) & 3); - } -} - -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl &ShuffleMask) { - Type *MaskTy = C->getType(); - assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); - assert(MaskTy->getVectorElementType()->isIntegerTy() && - "Expected integer constant mask elements!"); - int ElementBits = MaskTy->getScalarSizeInBits(); - int NumElements = MaskTy->getVectorNumElements(); - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && - "Unexpected number of vector elements."); - ShuffleMask.reserve(NumElements); - if (auto *CDS = dyn_cast(C)) { - assert((unsigned)NumElements == CDS->getNumElements() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - uint64_t Element = CDS->getElementAsInteger(i); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } else if (auto *CV = dyn_cast(C)) { - assert((unsigned)NumElements == C->getNumOperands() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - Constant *COp = CV->getOperand(i); - if (isa(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast(COp)->getZExtValue(); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } -} - -} // llvm namespace + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + int M = SM_SentinelZero; + if (i >= Imm) M = i - Imm + l; + ShuffleMask.push_back(M); + } +} + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + unsigned Base = i + Imm; + int M = Base + l; + if (Base >= NumLaneElts) M = SM_SentinelZero; + ShuffleMask.push_back(M); + } +} + +void DecodePALIGNRMask(MVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Base = i + Offset; + // if i+offset is out of this lane then we actually need the other source + if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; + ShuffleMask.push_back(Base + l); + } + } +} + +/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. +/// VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. 
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + l); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +void DecodePSHUFHWMask(MVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + 4 + (NewImm & 3)); + NewImm >>= 2; + } + } +} + +void DecodePSHUFLWMask(MVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + (NewImm & 3)); + NewImm >>= 2; + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + } +} + +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + // each half of a lane comes from different source + for (unsigned s = 0; s != NumElts*2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + s + l); + NewImm /= NumLaneElts; + } + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 + } + } +} + +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. 
+ unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 + } + } +} + +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask) { + if (Imm & 0x88) + return; // Not a shuffle + + unsigned HalfSize = VT.getVectorNumElements()/2; + + for (unsigned l = 0; l != 2; ++l) { + unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); + } +} + +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> + // + // <4 x i32> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // This is a straightforward byte vector. + if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { + int NumElements = MaskTy->getVectorNumElements(); + ShuffleMask.reserve(NumElements); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i < 16 ? 0 : 16; + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } + // TODO: Handle funny-looking vectors too. +} + +void DecodePSHUFBMask(ArrayRef RawMask, + SmallVectorImpl &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + if (M == (uint64_t)SM_SentinelUndef) { + ShuffleMask.push_back(M); + continue; + } + // For AVX vectors with 32 bytes the base of the shuffle is the half of + // the vector we're inside. + int Base = i < 16 ? 0 : 16; + // If the high bit (7) of the byte is set, the element is zeroed. + if (M & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (M & 0xf); + ShuffleMask.push_back(Index); + } + } +} + +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { + int ElementBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + for (int i = 0; i < NumElements; ++i) { + // If there are more than 8 elements in the vector, then any immediate blend + // mask applies to each 128-bit lane. There can never be more than + // 8 elements in a 128-bit lane with an immediate blend. + int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; + assert(Bit < 8 && + "Immediate blends only operate over 8 elements at a time!"); + ShuffleMask.push_back(((Imm >> Bit) & 1) ? 
NumElements + i : i);
+  }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != 4; ++i) {
+    ShuffleMask.push_back((Imm >> (2*i)) & 3);
+  }
+}
+
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
+  assert(MaskTy->getVectorElementType()->isIntegerTy() &&
+         "Expected integer constant mask elements!");
+  int ElementBits = MaskTy->getScalarSizeInBits();
+  int NumElements = MaskTy->getVectorNumElements();
+  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+         "Unexpected number of vector elements.");
+  ShuffleMask.reserve(NumElements);
+  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    assert((unsigned)NumElements == CDS->getNumElements() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      int Base = (i * ElementBits / 128) * (128 / ElementBits);
+      uint64_t Element = CDS->getElementAsInteger(i);
+      // Only the least significant 2 bits of the integer are used.
+      int Index = Base + (Element & 0x3);
+      ShuffleMask.push_back(Index);
+    }
+  } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+    assert((unsigned)NumElements == C->getNumOperands() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      int Base = (i * ElementBits / 128) * (128 / ElementBits);
+      Constant *COp = CV->getOperand(i);
+      if (isa<UndefValue>(COp)) {
+        ShuffleMask.push_back(SM_SentinelUndef);
+        continue;
+      }
+      uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+      // Only the least significant 2 bits of the integer are used.
+      int Index = Base + (Element & 0x3);
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
+  unsigned NumSrcElts = SrcVT.getVectorNumElements();
+  unsigned NumDstElts = DstVT.getVectorNumElements();
+  unsigned SrcScalarBits = SrcVT.getScalarSizeInBits();
+  unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+  unsigned Scale = DstScalarBits / SrcScalarBits;
+  assert(SrcScalarBits < DstScalarBits &&
+         "Expected zero extension mask to increase scalar size");
+  assert(NumSrcElts >= NumDstElts && "Too many zero extension lanes");
+
+  for (unsigned i = 0; i != NumDstElts; i++) {
+    Mask.push_back(i);
+    for (unsigned j = 1; j != Scale; j++)
+      Mask.push_back(SM_SentinelZero);
+  }
+}
+
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  ShuffleMask.push_back(0);
+  for (unsigned i = 1; i < NumElts; i++)
+    ShuffleMask.push_back(SM_SentinelZero);
+}
+
+void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+  // First element comes from the first element of second source.
+  // Remaining elements: Load zero extends / Move copies from first source.
+  unsigned NumElts = VT.getVectorNumElements();
+  Mask.push_back(NumElts);
+  for (unsigned i = 1; i < NumElts; i++)
+    Mask.push_back(IsLoad ?
SM_SentinelZero : i); +} +} // llvm namespace Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -5496,16 +5496,9 @@ IsUnary = true; break; case X86ISD::MOVSS: - case X86ISD::MOVSD: { - // The index 0 always comes from the first element of the second source, - // this is why MOVSS and MOVSD are used in the first place. The other - // elements come from the other positions of the first source vector - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) { - Mask.push_back(i); - } + case X86ISD::MOVSD: + DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); break; - } case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast(ImmN)->getZExtValue(), Mask); @@ -24740,7 +24733,7 @@ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), @@ -24770,7 +24763,7 @@ "Unexpected size for truncating masked store"); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. - assert (((NumElems * FromSz) % ToSz) == 0 && + assert (((NumElems * FromSz) % ToSz) == 0 && "Unexpected ratio for truncating masked store"); unsigned SizeRatio = FromSz / ToSz; Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -350,12 +350,12 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] -; SSE2-NEXT: movsd %xmm4, %xmm3 +; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -800,12 +800,12 @@ ; ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbq %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbq %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle @@ -827,12 +827,12 @@ ; ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: ; 
SSE41: # BB#0: -; SSE41-NEXT: pmovzxbq %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbq %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle @@ -853,12 +853,12 @@ ; ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbd %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbd %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle @@ -881,12 +881,12 @@ ; ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbd %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbd %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle @@ -905,12 +905,12 @@ ; ; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbw %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle @@ -931,12 +931,12 @@ ; ; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbw %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ 
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -211,19 +211,19 @@ define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) { ; SSE2-LABEL: shuffle_v2f64_03: ; SSE2: # BB#0: -; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2f64_03: ; SSE3: # BB#0: -; SSE3-NEXT: movsd %xmm0, %xmm1 +; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2f64_03: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -242,17 +242,17 @@ define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) { ; SSE2-LABEL: shuffle_v2f64_21: ; SSE2: # BB#0: -; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2f64_21: ; SSE3: # BB#0: -; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2f64_21: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v2f64_21: @@ -299,19 +299,19 @@ define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: shuffle_v2i64_03: ; SSE2: # BB#0: -; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2i64_03: ; SSE3: # BB#0: -; SSE3-NEXT: movsd %xmm0, %xmm1 +; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2i64_03: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -335,19 +335,19 @@ define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: shuffle_v2i64_03_copy: ; SSE2: # BB#0: -; SSE2-NEXT: movsd %xmm1, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2i64_03_copy: ; SSE3: # BB#0: -; SSE3-NEXT: movsd %xmm1, %xmm2 +; SSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE3-NEXT: movaps %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2i64_03_copy: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsd %xmm1, %xmm2 +; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSSE3-NEXT: movaps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; @@ -489,17 +489,17 @@ define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: shuffle_v2i64_21: ; SSE2: # BB#0: -; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2i64_21: ; SSE3: # BB#0: -; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2i64_21: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v2i64_21: @@ -522,19 +522,19 @@ define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: shuffle_v2i64_21_copy: ; SSE2: # BB#0: -; SSE2-NEXT: movsd %xmm2, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: 
shuffle_v2i64_21_copy: ; SSE3: # BB#0: -; SSE3-NEXT: movsd %xmm2, %xmm1 +; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2i64_21_copy: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsd %xmm2, %xmm1 +; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -650,12 +650,12 @@ define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) { ; SSE-LABEL: shuffle_v2i64_0z: ; SSE: # BB#0: -; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v2i64_0z: ; AVX: # BB#0: -; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> ret <2 x i64> %shuffle @@ -693,19 +693,19 @@ ; SSE2-LABEL: shuffle_v2i64_z1: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2i64_z1: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2i64_z1: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v2i64_z1: @@ -732,12 +732,12 @@ define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) { ; SSE-LABEL: shuffle_v2f64_0z: ; SSE: # BB#0: -; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v2f64_0z: ; AVX: # BB#0: -; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> ret <2 x double> %shuffle @@ -780,19 +780,19 @@ ; SSE2-LABEL: shuffle_v2f64_z1: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v2f64_z1: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v2f64_z1: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v2f64_z1: @@ -828,12 +828,12 @@ define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) { ; SSE-LABEL: insert_mem_and_zero_v2i64: ; SSE: # BB#0: -; SSE-NEXT: movq (%rdi), %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: insert_mem_and_zero_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vmovq (%rdi), %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: retq %a = load i64* %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 @@ -844,12 +844,12 @@ define <2 x double> @insert_reg_and_zero_v2f64(double %a) { ; SSE-LABEL: insert_reg_and_zero_v2f64: ; SSE: # BB#0: -; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: insert_reg_and_zero_v2f64: ; AVX: # BB#0: -; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: retq %v = insertelement <2 x double> undef, double %a, i32 0 %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> @@ -859,12 +859,12 @@ define <2 x 
double> @insert_mem_and_zero_v2f64(double* %ptr) { ; SSE-LABEL: insert_mem_and_zero_v2f64: ; SSE: # BB#0: -; SSE-NEXT: movsd (%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; ; AVX-LABEL: insert_mem_and_zero_v2f64: ; AVX: # BB#0: -; AVX-NEXT: vmovsd (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: retq %a = load double* %ptr %v = insertelement <2 x double> undef, double %a, i32 0 @@ -876,19 +876,19 @@ ; SSE2-LABEL: insert_reg_lo_v2i64: ; SSE2: # BB#0: ; SSE2-NEXT: movd %rdi, %xmm1 -; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_reg_lo_v2i64: ; SSE3: # BB#0: ; SSE3-NEXT: movd %rdi, %xmm1 -; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_reg_lo_v2i64: ; SSSE3: # BB#0: ; SSSE3-NEXT: movd %rdi, %xmm1 -; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_reg_lo_v2i64: @@ -931,19 +931,19 @@ ; ; SSE41-LABEL: insert_mem_lo_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movq (%rdi), %xmm1 +; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: insert_mem_lo_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vmovq (%rdi), %xmm1 +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_mem_lo_v2i64: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm1 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-NEXT: retq %a = load i64* %ptr @@ -972,13 +972,13 @@ define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) { ; SSE-LABEL: insert_mem_hi_v2i64: ; SSE: # BB#0: -; SSE-NEXT: movq (%rdi), %xmm1 +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: insert_mem_hi_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vmovq (%rdi), %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %a = load i64* %ptr @@ -990,13 +990,13 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) { ; SSE-LABEL: insert_reg_lo_v2f64: ; SSE: # BB#0: -; SSE-NEXT: movsd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_reg_lo_v2f64: ; AVX: # BB#0: -; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq %v = insertelement <2 x double> undef, double %a, i32 0 %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> @@ -1085,7 +1085,7 @@ define <2 x double> @insert_dup_mem_v2f64(double* %ptr) { ; SSE2-LABEL: insert_dup_mem_v2f64: ; SSE2: # BB#0: -; SSE2-NEXT: movsd (%rdi), %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] ; SSE2-NEXT: retq ; Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -441,21 +441,21 @@ ; SSE2-LABEL: shuffle_v4f32_4zzz: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: 
movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_4zzz: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_4zzz: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -661,21 +661,21 @@ ; SSE2-LABEL: shuffle_v4i32_4zzz: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_4zzz: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_4zzz: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -698,21 +698,21 @@ ; SSE2-LABEL: shuffle_v4i32_z4zz: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_z4zz: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_z4zz: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] ; SSSE3-NEXT: retq ; @@ -737,21 +737,21 @@ ; SSE2-LABEL: shuffle_v4i32_zz4z: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_zz4z: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_zz4z: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] ; SSSE3-NEXT: retq ; @@ -1033,12 +1033,12 @@ ; ; SSE41-LABEL: shuffle_v4i32_0u1u: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxdq %xmm0, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v4i32_0u1u: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -1065,12 +1065,12 @@ ; ; SSE41-LABEL: shuffle_v4i32_0z1z: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxdq %xmm0, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v4i32_0z1z: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq %shuffle = shufflevector <4 x 
i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle @@ -1094,12 +1094,12 @@ define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) { ; SSE-LABEL: insert_mem_and_zero_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movd (%rdi), %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: insert_mem_and_zero_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vmovd (%rdi), %xmm0 +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: retq %a = load i32* %ptr %v = insertelement <4 x i32> undef, i32 %a, i32 0 @@ -1111,21 +1111,21 @@ ; SSE2-LABEL: insert_reg_and_zero_v4f32: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_reg_and_zero_v4f32: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_reg_and_zero_v4f32: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -1138,7 +1138,7 @@ ; AVX-LABEL: insert_reg_and_zero_v4f32: ; AVX: # BB#0: ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq %v = insertelement <4 x float> undef, float %a, i32 0 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> @@ -1148,12 +1148,12 @@ define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) { ; SSE-LABEL: insert_mem_and_zero_v4f32: ; SSE: # BB#0: -; SSE-NEXT: movss (%rdi), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: insert_mem_and_zero_v4f32: ; AVX: # BB#0: -; AVX-NEXT: vmovss (%rdi), %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: retq %a = load float* %ptr %v = insertelement <4 x float> undef, float %a, i32 0 @@ -1165,19 +1165,19 @@ ; SSE2-LABEL: insert_reg_lo_v4i32: ; SSE2: # BB#0: ; SSE2-NEXT: movd %rdi, %xmm1 -; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_reg_lo_v4i32: ; SSE3: # BB#0: ; SSE3-NEXT: movd %rdi, %xmm1 -; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_reg_lo_v4i32: ; SSSE3: # BB#0: ; SSSE3-NEXT: movd %rdi, %xmm1 -; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_reg_lo_v4i32: @@ -1221,19 +1221,19 @@ ; ; SSE41-LABEL: insert_mem_lo_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: movq (%rdi), %xmm1 +; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: insert_mem_lo_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovq (%rdi), %xmm1 +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_mem_lo_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm1 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-NEXT: retq %a = load <2 x i32>* %ptr @@ -1263,13 +1263,13 @@ define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) { ; SSE-LABEL: 
 ; SSE: # BB#0:
-; SSE-NEXT: movq (%rdi), %xmm1
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: insert_mem_hi_v4i32:
 ; AVX: # BB#0:
-; AVX-NEXT: vmovq (%rdi), %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: retq
 %a = load <2 x i32>* %ptr
@@ -1281,13 +1281,13 @@
 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
 ; SSE-LABEL: insert_reg_lo_v4f32:
 ; SSE: # BB#0:
-; SSE-NEXT: movsd %xmm0, %xmm1
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: insert_reg_lo_v4f32:
 ; AVX: # BB#0:
-; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT: retq
 %a.cast = bitcast double %a to <2 x float>
 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32>
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1829,12 +1829,12 @@
 ;
 ; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
 ; SSE41: # BB#0:
-; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
 ; AVX: # BB#0:
-; AVX-NEXT: vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
 ret <8 x i16> %shuffle
@@ -1857,12 +1857,12 @@
 ;
 ; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
 ; SSE41: # BB#0:
-; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
 ; AVX: # BB#0:
-; AVX-NEXT: vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
 ret <8 x i16> %shuffle
@@ -1881,12 +1881,12 @@
 ;
 ; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
 ; SSE41: # BB#0:
-; SSE41-NEXT: pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
 ; AVX: # BB#0:
-; AVX-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
 ret <8 x i16> %shuffle
@@ -1907,12 +1907,12 @@
 ;
 ; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
 ; SSE41: # BB#0:
-; SSE41-NEXT: pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
 ; AVX: # BB#0:
-; AVX-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
 ret <8 x i16> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -3,13 +3,13 @@
 target triple = "x86_64-unknown-unknown"
-define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0000:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
+define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4f64_0000:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
@@ -18,13 +18,13 @@
 ret <4 x double> %shuffle
 }
-define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0001:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
+define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0001:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4f64_0001:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
@@ -35,13 +35,13 @@
 define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_0020:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4f64_0020:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
@@ -67,13 +67,13 @@
 }
 define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_1000:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
+; AVX1-LABEL: shuffle_v4f64_1000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4f64_1000:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
@@ -83,13 +83,13 @@
 }
 define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_2200:
-; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4f64_2200:
+; AVX1-LABEL: shuffle_v4f64_2200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_2200:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
 ; AVX2-NEXT: retq
@@ -138,13 +138,13 @@
 ret <4 x double> %shuffle
 }
-define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_0022:
-; ALL: # BB#0:
-; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; ALL-NEXT: retq
- %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
- ret <4 x double> %shuffle
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0022:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
+ ret <4 x double> %shuffle
 }
 define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
@@ -183,13 +183,13 @@
 ret <4 x double> %shuffle
 }
-define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0423:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX1-NEXT: retq
-;
+define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0423:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4f64_0423:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
@@ -199,14 +199,14 @@
 ret <4 x double> %shuffle
 }
-define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_0462:
-; ALL: # BB#0:
-; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
-; ALL-NEXT: retq
- %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
+define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0462:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
 ret <4 x double> %shuffle
 }
@@ -358,13 +358,13 @@
 ret <4 x double> %shuffle
 }
-define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_0000:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
+define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4i64_0000:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
@@ -373,13 +373,13 @@
 ret <4 x i64> %shuffle
 }
-define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_0001:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
+define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0001:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4i64_0001:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
@@ -390,13 +390,13 @@
 define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0020:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4i64_0020:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
@@ -438,13 +438,13 @@
 }
 define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_1000:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
+; AVX1-LABEL: shuffle_v4i64_1000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4i64_1000:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
@@ -454,13 +454,13 @@
 }
 define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_2200:
-; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i64_2200:
+; AVX1-LABEL: shuffle_v4i64_2200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_2200:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
 ; AVX2-NEXT: retq
@@ -500,13 +500,13 @@
 ret <4 x i64> %shuffle
 }
-define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_0124:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
-; AVX1-NEXT: retq
+define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0124:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0124:
 ; AVX2: # BB#0:
@@ -538,13 +538,13 @@
 define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0412:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX1-NEXT: retq
-;
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: shuffle_v4i64_0412:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
@@ -557,13 +557,13 @@
 define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_4012:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_4012:
 ; AVX2: # BB#0:
@@ -794,14 +794,14 @@
 define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vmovq (%rdi), %xmm0
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT: retq
@@ -815,7 +815,7 @@
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL: # BB#0:
 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; ALL-NEXT: retq
 %v = insertelement <4 x double> undef, double %a, i32 0
 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32>
@@ -825,7 +825,7 @@
 define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
 ; ALL-LABEL: insert_mem_and_zero_v4f64:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovsd (%rdi), %xmm0
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT: retq
 %a = load double* %ptr
 %v = insertelement <4 x double> undef, double %a, i32 0
@@ -872,13 +872,13 @@
 ret <4 x double> %3
 }
-define <4 x double> @splat_v4f64(<2 x double> %r) {
-; AVX1-LABEL: splat_v4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
+define <4 x double> @splat_v4f64(<2 x double> %r) {
+; AVX1-LABEL: splat_v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
 ; AVX2-LABEL: splat_v4f64:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1853,7 +1853,7 @@
 define <8 x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
 ; ALL-LABEL: concat_v2f32_1:
 ; ALL: # BB#0: # %entry
-; ALL-NEXT: vmovq (%rdi), %xmm0
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
 ; ALL-NEXT: retq
 entry:
@@ -1868,7 +1868,7 @@
 define <8 x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
 ; ALL-LABEL: concat_v2f32_2:
 ; ALL: # BB#0: # %entry
-; ALL-NEXT: vmovq (%rdi), %xmm0
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
 ; ALL-NEXT: retq
 entry:
@@ -1881,7 +1881,7 @@
 define <8 x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
 ; ALL-LABEL: concat_v2f32_3:
 ; ALL: # BB#0: # %entry
-; ALL-NEXT: vmovq (%rdi), %xmm0
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
 ; ALL-NEXT: retq
 entry: