diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -308,6 +308,15 @@
                              Instruction *I = nullptr) const override;
   bool isLegalICmpImmediate(int64_t Imm) const override;
   bool isLegalAddImmediate(int64_t Imm) const override;
+
+  unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+  bool isLegalInterleavedAccessType(VectorType *VecTy,
+                                    const DataLayout &DL) const;
+  bool lowerInterleavedLoad(LoadInst *LI,
+                            ArrayRef<ShuffleVectorInst *> Shuffles,
+                            ArrayRef<unsigned> Indices,
+                            unsigned Factor) const override;
+
   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
   bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "RISCVTargetMachine.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -28,8 +29,8 @@
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
@@ -953,6 +954,143 @@
   return isInt<12>(Imm);
 }
 
+bool RISCVTargetLowering::isLegalInterleavedAccessType(
+    VectorType *VecTy, const DataLayout &DL) const {
+
+  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+  unsigned NumElements =
+      isa<ScalableVectorType>(VecTy)
+          ? cast<ScalableVectorType>(VecTy)->getMinNumElements()
+          : cast<FixedVectorType>(VecTy)->getNumElements();
+  // Ensure the number of vector elements is greater than 1.
+  if (NumElements < 2)
+    return false;
+
+  // Ensure the element type is legal.
+  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+    return false;
+
+  if (VecTy->getElementType()->isHalfTy())
+    return Subtarget.hasStdExtZfh();
+  if (VecTy->getElementType()->isFloatTy())
+    return Subtarget.hasStdExtF();
+  if (VecTy->getElementType()->isDoubleTy())
+    return Subtarget.hasStdExtD();
+
+  return true;
+}
+
+bool RISCVTargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+
+  VectorType *VTy = Shuffles[0]->getType();
+
+  // Skip if we do not have StdExtV and skip illegal vector types.
+  if (!Subtarget.hasStdExtV() || !isLegalInterleavedAccessType(VTy, DL))
+    return false;
+
+  ScalableVectorType *SVTy;
+  bool IsScalableVector = isa<ScalableVectorType>(VTy);
+  if (!IsScalableVector) {
+    auto *FVTy = cast<FixedVectorType>(VTy);
+    SVTy =
+        ScalableVectorType::get(FVTy->getElementType(), FVTy->getNumElements());
+  } else {
+    SVTy = cast<ScalableVectorType>(VTy);
+  }
+
+  // A pointer vector cannot be the return type of the vlsegN intrinsics. Need
+  // to load integer vectors first and then convert to pointer vectors.
+  Type *EltTy = SVTy->getElementType();
+  if (EltTy->isPointerTy())
+    SVTy = ScalableVectorType::get(DL.getIntPtrType(EltTy),
+                                   SVTy->getMinNumElements());
+
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  Type *PtrTy =
+      SVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
+  Type *IntTy;
+  Triple T = getTargetMachine().getTargetTriple();
+  if (T.isArch64Bit())
+    IntTy = IntegerType::get(SVTy->getContext(), 64);
+  else {
+    assert(T.isArch32Bit() && "only RV32 and RV64 are currently supported");
+    IntTy = IntegerType::get(SVTy->getContext(), 32);
+  }
+
+  Type *Tys[] = {SVTy, IntTy};
+  static const Intrinsic::ID VlsegInts[3] = {Intrinsic::riscv_vlseg2,
+                                             Intrinsic::riscv_vlseg3,
+                                             Intrinsic::riscv_vlseg4};
+  Function *VlsegNFunc =
+      Intrinsic::getDeclaration(LI->getModule(), VlsegInts[Factor - 2], Tys);
+
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
+
+  Value *VL;
+  if (!IsScalableVector)
+    VL = ConstantInt::get(IntTy, cast<FixedVectorType>(VTy)->getNumElements());
+  else {
+    Function *VscaleFunc =
+        Intrinsic::getDeclaration(LI->getModule(), Intrinsic::vscale, IntTy);
+    VL = Builder.CreateCall(VscaleFunc, {}, "vscale");
+  }
+  CallInst *VlsegN = Builder.CreateCall(
+      VlsegNFunc, {Builder.CreateBitCast(BaseAddr, PtrTy), VL}, "vlsegN");
+
+  // Extract and store the sub-vectors returned by the load intrinsic.
+  for (unsigned I = 0; I < Shuffles.size(); I++) {
+    ShuffleVectorInst *SVI = Shuffles[I];
+    unsigned Index = Indices[I];
+
+    Value *SubVec = Builder.CreateExtractValue(VlsegN, Index);
+
+    // Convert the integer vector to pointer vector if the element is pointer.
+    if (EltTy->isPointerTy())
+      SubVec = Builder.CreateIntToPtr(
+          SubVec, ScalableVectorType::get(SVI->getType()->getElementType(),
+                                          SVTy->getMinNumElements()));
+
+    if (!IsScalableVector) {
+      Type *IntrinsicTypes[] = {cast<FixedVectorType>(VTy), SubVec->getType()};
+      Function *VExtractFunc = Intrinsic::getDeclaration(
+          LI->getModule(), Intrinsic::experimental_vector_extract,
+          IntrinsicTypes);
+      Value *ExtractSubVec = Builder.CreateCall(
+          VExtractFunc, {SubVec, ConstantInt::get(IntTy, 0)}, "");
+      SubVecs[SVI].push_back(ExtractSubVec);
+    } else {
+      SubVecs[SVI].push_back(SubVec);
+    }
+  }
+
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
+  }
+
+  return true;
+}
+
 // On RV32, 64-bit integers are split into their high and low parts and held
 // in two different registers, so the trunc is free since the low register can
 // just be used.
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -150,6 +150,7 @@
 void RISCVPassConfig::addIRPasses() {
   addPass(createAtomicExpandPass());
   TargetPassConfig::addIRPasses();
+  addPass(createInterleavedAccessPass());
 }
 
 bool RISCVPassConfig::addInstSelector() {
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple riscv64-linux-gnu -interleaved-access \
+; RUN:   -mattr=+experimental-v -S < %s | FileCheck %s
+
+define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
+; CHECK-LABEL: @load_factor2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEXT:    [[VLSEGN:%.*]] = call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.riscv.vlseg2.nxv8i8.i64(i8* [[TMP1]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } [[VLSEGN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv8i8(<vscale x 8 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } [[VLSEGN]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv8i8(<vscale x 8 x i8> [[TMP4]], i64 0)
+; CHECK-NEXT:    ret <8 x i8> [[TMP3]]
+;
+  %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
+  %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i8> %v1
+}
+
+define <4 x i32> @load_factor3(<12 x i32>* %ptr) {
+; CHECK-LABEL: @load_factor3(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEXT:    [[VLSEGN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.riscv.vlseg3.nxv4i32.i64(i32* [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[VLSEGN]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[VLSEGN]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[VLSEGN]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_factor4(<16 x i32>* %ptr) {
+; CHECK-LABEL: @load_factor4(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEXT:    [[VLSEGN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.riscv.vlseg4.nxv4i32.i64(i32* [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[VLSEGN]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[VLSEGN]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[VLSEGN]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[VLSEGN]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  ret <4 x i32> %v3
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/lit.local.cfg b/llvm/test/Transforms/InterleavedAccess/RISCV/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'RISCV' in config.root.targets:
+    config.unsupported = True