Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -49,8 +49,16 @@
     Mod(nullptr),
     HasUnsafeFPMath(false) { }
 
+  /// \brief Promotes uniform 16 bit operation to equivalent 32 bit operation by
+  /// zero extending operands to 32 bits, replacing 16 bit operation with
+  /// equivalent 32 bit operation, and truncating the result of 32 bit operation
+  /// back to 16 bits. Always returns true.
+  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
+
   bool visitFDiv(BinaryOperator &I);
+  bool visitBinaryOperator(BinaryOperator &I);
+
   bool visitInstruction(Instruction &I) { return false; }
@@ -79,6 +87,25 @@
   return UnsafeDiv || CNum->isExactlyValue(+1.0);
 }
 
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
+  assert(DA->isUniform(&I) && "Op must be uniform");
+  assert(I.getType()->isIntegerTy(16) && "Op must be 16 bits");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Value *ZExtOp0 = Builder.CreateZExt(I.getOperand(0), Builder.getInt32Ty());
+  Value *ZExtOp1 = Builder.CreateZExt(I.getOperand(1), Builder.getInt32Ty());
+  Value *ZExtRes = Builder.CreateBinOp(I.getOpcode(), ZExtOp0, ZExtOp1);
+  Value *TruncRes = Builder.CreateTrunc(ZExtRes, Builder.getInt16Ty());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.dropAllReferences();
+  I.eraseFromParent();
+
+  return true;
+}
+
 // Insert an intrinsic for fast fdiv for safe math situations where we can
 // reduce precision. Leave fdiv for situations where the generic node is
 // expected to be optimized.
@@ -149,6 +176,16 @@
   return true;
 }
 
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  bool Changed = false;
+
+  // Promote uniform 16 bit operation to equivalent 32 bit operation.
+  if (DA->isUniform(&I) && I.getType()->isIntegerTy(16))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
 static bool hasUnsafeFPMath(const Function &F) {
   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
   return Attr.getValueAsString() == "true";
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,6 +540,10 @@
 
 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+  // i16 is not desirable unless it is a load or a store.
+  if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
+    return false;
+
   // SimplifySetCC uses this function to determine whether or not it should
   // create setcc with i1 operands. We don't have instructions for i1 setcc.
   if (VT == MVT::i1 && Op == ISD::SETCC)
Index: test/CodeGen/AMDGPU/mul_uint24.ll
===================================================================
--- test/CodeGen/AMDGPU/mul_uint24.ll
+++ test/CodeGen/AMDGPU/mul_uint24.ll
@@ -23,8 +23,8 @@
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG: 16
-; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
+; SI: s_mul_i32
+; SI: s_sext_i32_i16
 define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
@@ -34,9 +34,9 @@
 }
 
 ; FUNC-LABEL: {{^}}test_umul24_i16:
+; SI: s_mul_i32
 ; SI: s_and_b32
-; SI: v_mul_u32_u24_e32
-; SI: v_and_b32_e32
+; SI: v_mov_b32_e32
 define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
Index: test/CodeGen/AMDGPU/sdivrem24.ll
===================================================================
--- test/CodeGen/AMDGPU/sdivrem24.ll
+++ test/CodeGen/AMDGPU/sdivrem24.ll
@@ -22,10 +22,10 @@
 }
 
 ; FUNC-LABEL: {{^}}sdiv24_i16:
-; SI: v_cvt_f32_i32
-; SI: v_cvt_f32_i32
-; SI: v_rcp_f32
-; SI: v_cvt_i32_f32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_rcp_f32_e32
+; SI: v_cvt_u32_f32_e32
 
 ; EG: INT_TO_FLT
 ; EG-DAG: INT_TO_FLT
@@ -140,10 +140,10 @@
 }
 
 ; FUNC-LABEL: {{^}}srem24_i16:
-; SI: v_cvt_f32_i32
-; SI: v_cvt_f32_i32
-; SI: v_rcp_f32
-; SI: v_cvt_i32_f32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_rcp_f32_e32
+; SI: v_cvt_u32_f32_e32
 
 ; EG: INT_TO_FLT
 ; EG-DAG: INT_TO_FLT