Index: X86InterleavedAccess.cpp =================================================================== --- X86InterleavedAccess.cpp +++ X86InterleavedAccess.cpp @@ -0,0 +1,152 @@ +//===------- X86InterleavedAccess.cpp --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the interleaved accesses +// optimization generating X86-specific instructions/intrinsics for interleaved +// access groups. +// +//===----------------------------------------------------------------------===// + +#include "X86ISelLowering.h" +#include "X86TargetMachine.h" + +using namespace llvm; + +/// Returns true if the interleaved access group represented by the shuffles +/// are supported for the subtarget. Returns false otherwise. +static bool isSupported(const X86Subtarget &SubTarget, + const ArrayRef Shuffles) { + + const DataLayout &DL = Shuffles[0]->getModule()->getDataLayout(); + VectorType *ShuffleVecTy = Shuffles[0]->getType(); + unsigned ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy); + Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); + + // Currently, lowering is supported only for four interleaved accesses of + // 64 bits on AVX2. + if (!SubTarget.hasAVX() || ShuffleVecSize != 256 || + DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Shuffles.size() != 4) + return false; + + return true; +} + +/// \brief Lower interleaved load(s) into target specific instructions/ +/// intrinsics. Lowering sequence varies depending on the vector-types, factor, +/// number of shuffles and ISA. +/// +/// Current supported interleaved loads: here, T = {i/f} +/// %wide.vec = load <16 x T64>, <16 x T64>* %ptr +/// %v0 = shuffle %wide.vec, undef, <0, 4, 8, 12> ; +/// %v1 = shuffle %wide.vec, undef, <1, 5, 9, 13> ; +/// %v2 = shuffle %wide.vec, undef, <2, 6, 10, 14> ; +/// %v3 = shuffle %wide.vec, undef, <3, 7, 11, 15> ; +/// +/// Into: +/// %load0 = load <4 x T64>, <4 x T64>* %ptr +/// %load1 = load <4 x T64>, <4 x T64>* %ptr+32 +/// %load2 = load <4 x T64>, <4 x T64>* %ptr+64 +/// %load3 = load <4 x T64>, <4 x T64>* %ptr+96 +/// +/// %intrshuffvec1 = shuffle %load0, %load2, <0, 1, 4, 5>; +/// %intrshuffvec2 = shuffle %load1, %load3, <0, 1, 4, 5>; +/// %v0 = shuffle %intrshuffvec1, %intrshuffvec2, <0, 4, 2, 6>; +/// %v1 = shuffle %intrshuffvec1, %intrshuffvec2, <1, 5, 3, 7>; +/// +/// %intrshuffvec3 = shuffle %load0, %load2, <2, 3, 6, 7>; +/// %intrshuffvec4 = shuffle %load1, %load3, <2, 3, 6, 7>; +/// %v2 = shuffle %intrshuffvec3, %intrshuffvec4, <0, 4, 2, 6>; +/// %v3 = shuffle %intrshuffvec3, %intrshuffvec4, <1, 5, 3, 7>; +/// +static bool lower(LoadInst *LI, ArrayRef Shuffles, + unsigned Factor) { + const DataLayout &DL = LI->getModule()->getDataLayout(); + + VectorType *ShuffleVecTy = Shuffles[0]->getType(); + unsigned ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy); + Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); + + assert(DL.getTypeSizeInBits(LI->getType()) == Factor * ShuffleVecSize && + "Unexpected load size"); + + Type *NewBasePtrTy = ShuffleVecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *ScalarPtrTy = ShuffleEltTy->getPointerTo(LI->getPointerAddressSpace()); + + IRBuilder<> Builder(LI); + SmallVector NewLoads; + + Value *ScalarBasePtr = + Builder.CreateBitCast(LI->getPointerOperand(), ScalarPtrTy); + + // Generate 4 loads of type v4xT64 + for (unsigned Part = 0; Part < Factor; Part++) { + Value *NewBasePtr = Builder.CreateGEP(nullptr, ScalarBasePtr, + Builder.getInt32(Part * Factor)); + NewBasePtr = Builder.CreateBitCast(NewBasePtr, NewBasePtrTy); + Instruction *NewLoad = + Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment()); + NewLoads.push_back(NewLoad); + } + + // dst = src1[0,1],src2[0,1] + uint32_t IntMask1[] = {0, 1, 4, 5}; + ArrayRef ShuffleMask = makeArrayRef(IntMask1, 4); + Value *IntrVec1 = + Builder.CreateShuffleVector(NewLoads[0], NewLoads[2], ShuffleMask); + Value *IntrVec2 = + Builder.CreateShuffleVector(NewLoads[1], NewLoads[3], ShuffleMask); + + // dst = src1[2,3],src2[2,3] + uint32_t IntMask2[] = {2, 3, 6, 7}; + ShuffleMask = makeArrayRef(IntMask2, 4); + Value *IntrVec3 = + Builder.CreateShuffleVector(NewLoads[0], NewLoads[2], ShuffleMask); + Value *IntrVec4 = + Builder.CreateShuffleVector(NewLoads[1], NewLoads[3], ShuffleMask); + + // dst = src1[0],src2[0],src1[2],src2[2] + uint32_t IntMask3[] = {0, 4, 2, 6}; + ShuffleMask = makeArrayRef(IntMask3, 4); + Shuffles[3]->replaceAllUsesWith( + Builder.CreateShuffleVector(IntrVec1, IntrVec2, ShuffleMask)); + Shuffles[1]->replaceAllUsesWith( + Builder.CreateShuffleVector(IntrVec3, IntrVec4, ShuffleMask)); + + // dst = src1[1],src2[1],src1[3],src2[3] + uint32_t IntMask4[] = {1, 5, 3, 7}; + ShuffleMask = makeArrayRef(IntMask4, 4); + Shuffles[2]->replaceAllUsesWith( + Builder.CreateShuffleVector(IntrVec1, IntrVec2, ShuffleMask)); + Shuffles[0]->replaceAllUsesWith( + Builder.CreateShuffleVector(IntrVec3, IntrVec4, ShuffleMask)); + return true; +} + +bool X86TargetLowering::lowerInterleavedLoad( + LoadInst *LI, ArrayRef Shuffles, + ArrayRef Indices, unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + assert(!Shuffles.empty() && "Empty shufflevector input"); + assert(Shuffles.size() == Indices.size() && + "Unmatched number of shufflevectors and indices"); + + SmallSetVector UniqueIndices; + for (unsigned Index : Indices) + UniqueIndices.insert(Index); + + if (Shuffles.size() != UniqueIndices.size()) + return false; + + if (!isSupported(Subtarget, Shuffles)) { + return false; + } + + return lower(LI, Shuffles, Factor); +}