Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -77,14 +77,14 @@
   ///
   /// \returns True if 16 bit binary operation is promoted to equivalent 32 bit
   /// binary operation, false otherwise.
-  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
+  bool promoteUniformI16OpToI32(BinaryOperator &I) const;
 
   /// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp'
   /// operation by sign or zero extending operands to 32 bits, and replacing 16
   /// bit operation with 32 bit operation.
   ///
   /// \returns True.
-  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;
+  bool promoteUniformI16OpToI32(ICmpInst &I) const;
 
   /// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select'
   /// operation by sign or zero extending operands to 32 bits, replacing 16 bit
@@ -92,7 +92,16 @@
   /// operation back to 16 bits.
   ///
   /// \returns True.
-  bool promoteUniformI16OpToI32Op(SelectInst &I) const;
+  bool promoteUniformI16OpToI32(SelectInst &I) const;
+
+  /// \brief Promotes uniform 16 bit 'bitreverse' intrinsic \p I to 32 bit
+  /// 'bitreverse' intrinsic by zero extending operand to 32 bits, replacing 16
+  /// bit intrinsic with 32 bit intrinsic, shifting the result of 32 bit
+  /// intrinsic 16 bits to the right with zero fill, and truncating the result
+  /// of shift operation back to 16 bits.
+  ///
+  /// \returns True.
+  bool promoteUniformI16BitreverseIntrinsicToI32(IntrinsicInst &I) const;
 
 public:
   static char ID;
@@ -111,6 +120,9 @@
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
 
+  bool visitIntrinsicInst(IntrinsicInst &I);
+  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
 
@@ -183,8 +195,8 @@
       cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
 }
 
-bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
-  assert(isI16Ty(I.getType()) && "Op must be 16 bits");
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(BinaryOperator &I) const {
+  assert(isI16Ty(I.getType()) && "I must be 16 bits");
 
   if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
     return false;
@@ -214,7 +226,7 @@
   return true;
 }
 
-bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(ICmpInst &I) const {
   assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
   assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");
 
@@ -242,8 +254,8 @@
   return true;
 }
 
-bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
-  assert(isI16Ty(I.getType()) && "Op must be 16 bits");
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(SelectInst &I) const {
+  assert(isI16Ty(I.getType()) && "I must be 16 bits");
 
   IRBuilder<> Builder(&I);
   Builder.SetCurrentDebugLocation(I.getDebugLoc());
@@ -270,6 +282,36 @@
   return true;
 }
 
+bool AMDGPUCodeGenPrepare::promoteUniformI16BitreverseIntrinsicToI32(
+    IntrinsicInst &I) const {
+  assert(I.getIntrinsicID() == Intrinsic::bitreverse && "I must be bitreverse");
+  assert(isI16Ty(I.getType()) && "I must be 16 bits");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Function *I32 = nullptr;
+  Value *ExtOp = nullptr;
+  Value *ExtRes = nullptr;
+  Value *LShrOp = nullptr;
+  Value *TruncRes = nullptr;
+
+  // Zero extend the operand, reverse all 32 bits, then shift the result right
+  // by 16 so that the reversed 16 bit value ends up in the low half before it
+  // is truncated back to 16 bits.
+  ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
+  I32 = Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
+  ExtRes = Builder.CreateCall(I32, { ExtOp });
+  LShrOp = Builder.CreateLShr(ExtRes, 16);
+  TruncRes = Builder.CreateTrunc(LShrOp, getI16Ty(Builder, ExtRes->getType()));
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -359,7 +401,7 @@
 
   // TODO: Should we promote smaller types that will be legalized to i16?
   if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
-    Changed |= promoteUniformI16OpToI32Op(I);
+    Changed |= promoteUniformI16OpToI32(I);
 
   return Changed;
 }
@@ -370,7 +412,7 @@
   // TODO: Should we promote smaller types that will be legalized to i16?
   if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
       isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
-    Changed |= promoteUniformI16OpToI32Op(I);
+    Changed |= promoteUniformI16OpToI32(I);
 
   return Changed;
 }
@@ -380,7 +422,26 @@
 
   // TODO: Should we promote smaller types that will be legalized to i16?
   if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
-    Changed |= promoteUniformI16OpToI32Op(I);
+    Changed |= promoteUniformI16OpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  bool Changed = false;
+
+  if (I.getIntrinsicID() == Intrinsic::bitreverse)
+    Changed |= visitBitreverseIntrinsicInst(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
+    Changed |= promoteUniformI16BitreverseIntrinsicToI32(I);
 
   return Changed;
 }
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -430,6 +430,18 @@
   ret i16 %sel
 }
 
+declare i16 @llvm.bitreverse.i16(i16)
+; VI-LABEL: @bitreverse_i16(
+; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; VI: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]])
+; VI: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 16
+; VI: %[[R_16:[0-9]+]] = trunc i32 %[[S_32]] to i16
+; VI: ret i16 %[[R_16]]
+define i16 @bitreverse_i16(i16 %a) {
+  %brev = call i16 @llvm.bitreverse.i16(i16 %a)
+  ret i16 %brev
+}
+
 ; VI-LABEL: @add_3xi16(
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
@@ -854,3 +866,15 @@
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
   ret <3 x i16> %sel
 }
+
+declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>)
+; VI-LABEL: @bitreverse_3xi16(
+; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
+; VI: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]])
+; VI: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], <i32 16, i32 16, i32 16>
+; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i16>
+; VI: ret <3 x i16> %[[R_16]]
+define <3 x i16> @bitreverse_3xi16(<3 x i16> %a) {
+  %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)
+  ret <3 x i16> %brev
+}