Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -39,6 +39,53 @@
   Module *Mod;
   bool HasUnsafeFPMath;
 
+  /// \returns True if binary operation \p I is a signed binary operation,
+  /// false otherwise.
+  bool isSigned(const BinaryOperator &I) const {
+    return I.getOpcode() == Instruction::AShr ||
+           I.getOpcode() == Instruction::SDiv ||
+           I.getOpcode() == Instruction::SRem;
+  }
+
+  /// \returns True if 'icmp' operation \p I is a signed 'icmp' operation,
+  /// false otherwise.
+  bool isSigned(const ICmpInst &I) const {
+    return I.getPredicate() == ICmpInst::ICMP_SGT ||
+           I.getPredicate() == ICmpInst::ICMP_SGE ||
+           I.getPredicate() == ICmpInst::ICMP_SLT ||
+           I.getPredicate() == ICmpInst::ICMP_SLE;
+  }
+
+  /// \returns True if the condition of 'select' operation \p I comes from a
+  /// signed 'icmp' operation, false otherwise.
+  bool isSigned(const SelectInst &I) const {
+    return isa<ICmpInst>(I.getOperand(0)) &&
+           isSigned(*cast<ICmpInst>(I.getOperand(0)));
+  }
+
+  /// \brief Promotes uniform 16 bit binary operation \p I to the equivalent
+  /// 32 bit binary operation by sign or zero extending operands to 32 bits,
+  /// replacing the 16 bit operation with the equivalent 32 bit operation, and
+  /// truncating the result of the 32 bit operation back to 16 bits.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
+
+  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to the 32 bit
+  /// 'icmp' operation by sign or zero extending operands to 32 bits, and
+  /// replacing the 16 bit operation with the 32 bit operation.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;
+
+  /// \brief Promotes uniform 16 bit 'select' operation \p I to the 32 bit
+  /// 'select' operation by sign or zero extending operands to 32 bits,
+  /// replacing the 16 bit operation with the 32 bit operation, and truncating
+  /// the result of the 32 bit operation back to 16 bits.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(SelectInst &I) const;
+
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
@@ -51,9 +98,10 @@
 
   bool visitFDiv(BinaryOperator &I);
 
-  bool visitInstruction(Instruction &I) {
-    return false;
-  }
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitBinaryOperator(BinaryOperator &I);
+  bool visitICmpInst(ICmpInst &I);
+  bool visitSelectInst(SelectInst &I);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -70,6 +118,95 @@
 
 } // End anonymous namespace
 
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
+  assert(I.getType()->isIntegerTy(16) && "Op must be 16 bits");
+  assert(DA->isUniform(&I) && "Op must be uniform");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *Int32Ty = Builder.getInt32Ty();
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), Int32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), Int32Ty);
+  }
+  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
+  TruncRes = Builder.CreateTrunc(ExtRes, Builder.getInt16Ty());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.dropAllReferences();
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
+  assert(I.getOperand(0)->getType()->isIntegerTy(16) && "Op0 must be 16 bits");
+  assert(I.getOperand(1)->getType()->isIntegerTy(16) && "Op1 must be 16 bits");
+  assert(DA->isUniform(&I) && "Op must be uniform");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *Int32Ty = Builder.getInt32Ty();
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *NewICmp = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), Int32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), Int32Ty);
+  }
+  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
+
+  I.replaceAllUsesWith(NewICmp);
+  I.dropAllReferences();
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
+  assert(I.getType()->isIntegerTy(16) && "Op must be 16 bits");
+  assert(DA->isUniform(&I) && "Op must be uniform");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *Int32Ty = Builder.getInt32Ty();
+  Value *ExtOp1 = nullptr;
+  Value *ExtOp2 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), Int32Ty);
+    ExtOp2 = Builder.CreateSExt(I.getOperand(2), Int32Ty);
+  } else {
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), Int32Ty);
+    ExtOp2 = Builder.CreateZExt(I.getOperand(2), Int32Ty);
+  }
+  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
+  TruncRes = Builder.CreateTrunc(ExtRes, Builder.getInt16Ty());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.dropAllReferences();
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -154,6 +291,37 @@
   return Attr.getValueAsString() == "true";
 }
 
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (I.getType()->isIntegerTy(16) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (I.getOperand(0)->getType()->isIntegerTy(16) &&
+      I.getOperand(1)->getType()->isIntegerTy(16) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (I.getType()->isIntegerTy(16) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   return false;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,6 +540,10 @@
 
 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+  // i16 is not desirable unless it is a load or a store.
+  if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
+    return false;
+
   // SimplifySetCC uses this function to determine whether or not it should
   // create setcc with i1 operands. We don't have instructions for i1 setcc.
   if (VT == MVT::i1 && Op == ISD::SETCC)
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@@ -231,6 +231,299 @@
   ret void
 }
 
+; CHECK-LABEL: @promote_uniform_add_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_add_i16(i16 %a, i16 %b) {
+  %r = add i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_sub_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_sub_i16(i16 %a, i16 %b) {
+  %r = sub i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_mul_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_mul_i16(i16 %a, i16 %b) {
+  %r = mul i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_udiv_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = udiv i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_udiv_i16(i16 %a, i16 %b) {
+  %r = udiv i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_sdiv_i16(
+; CHECK: %[[A_32:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = sdiv i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_sdiv_i16(i16 %a, i16 %b) {
+  %r = sdiv i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_urem_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_urem_i16(i16 %a, i16 %b) {
+  %r = urem i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_srem_i16(
+; CHECK: %[[A_32:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_srem_i16(i16 %a, i16 %b) {
+  %r = srem i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_shl_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_shl_i16(i16 %a, i16 %b) {
+  %r = shl i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_lshr_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_lshr_i16(i16 %a, i16 %b) {
+  %r = lshr i16 %a, %b
+  ret i16 %r
+}
+
+; 'ashr' must be promoted with sext: zero extending the operands would turn it
+; into an lshr on the low 16 bits.
+; CHECK-LABEL: @promote_uniform_ashr_i16(
+; CHECK: %[[A_32:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_ashr_i16(i16 %a, i16 %b) {
+  %r = ashr i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_and_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_and_i16(i16 %a, i16 %b) {
+  %r = and i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_or_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_or_i16(i16 %a, i16 %b) {
+  %r = or i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_xor_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_xor_i16(i16 %a, i16 %b) {
+  %r = xor i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_select_eq_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_eq_i16(i16 %a, i16 %b) {
+  %cmp = icmp eq i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ne_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ne_i16(i16 %a, i16 %b) {
+  %cmp = icmp ne i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ugt_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ugt_i16(i16 %a, i16 %b) {
+  %cmp = icmp ugt i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_uge_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_uge_i16(i16 %a, i16 %b) {
+  %cmp = icmp uge i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ult_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ult_i16(i16 %a, i16 %b) {
+  %cmp = icmp ult i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ule_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ule_i16(i16 %a, i16 %b) {
+  %cmp = icmp ule i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_sgt_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_sgt_i16(i16 %a, i16 %b) {
+  %cmp = icmp sgt i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_sge_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_sge_i16(i16 %a, i16 %b) {
+  %cmp = icmp sge i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_slt_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_slt_i16(i16 %a, i16 %b) {
+  %cmp = icmp slt i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_sle_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_sle_i16(i16 %a, i16 %b) {
+  %cmp = icmp sle i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
 attributes #0 = { nounwind optnone noinline }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind "target-features"="+fp32-denormals" }
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -244,7 +244,7 @@
 ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
 ; SI: buffer_load_ushort [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; SI: buffer_store_short [[FFBH]],
+; SI: buffer_store_short
 define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
   %val = load i16, i16 addrspace(1)* %valptr
   %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
Index: test/CodeGen/AMDGPU/mul_uint24.ll
===================================================================
--- test/CodeGen/AMDGPU/mul_uint24.ll
+++ test/CodeGen/AMDGPU/mul_uint24.ll
@@ -23,8 +23,8 @@
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG: 16
-; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
+; SI: s_mul_i32
+; SI: s_sext_i32_i16
 define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
@@ -33,10 +33,22 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_umul24_2xi16_sext:
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
+define void @test_umul24_2xi16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
+entry:
+  %mul = mul <2 x i16> %a, %b
+  %ext = sext <2 x i16> %mul to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}test_umul24_i16:
+; SI: s_mul_i32
 ; SI: s_and_b32
-; SI: v_mul_u32_u24_e32
-; SI: v_and_b32_e32
+; SI: v_mov_b32_e32
+
 define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
@@ -45,6 +57,18 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_umul24_2xi16:
+; SI: v_and_b32
+; SI: v_mul_u32_u24_e32
+; SI: v_and_b32_e32
+define void @test_umul24_2xi16(<2 x i32> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
+entry:
+  %mul = mul <2 x i16> %a, %b
+  %ext = zext <2 x i16> %mul to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}test_umul24_i8:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; The result must be sign-extended
Index: test/CodeGen/AMDGPU/sad.ll
===================================================================
--- test/CodeGen/AMDGPU/sad.ll
+++ test/CodeGen/AMDGPU/sad.ll
@@ -202,17 +202,18 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
-  %icmp0 = icmp ugt i16 %a, %b
-  %sub0 = sub i16 %a, %b
-  %sub1 = sub i16 %b, %a
-  %ret0 = select i1 %icmp0, i16 %sub0, i16 %sub1
+; GCN-LABEL: {{^}}v_sad_u32_2xi16_pat2:
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_sad_u32_2xi16_pat2(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+  %icmp0 = icmp ugt <2 x i16> %a, %b
+  %sub0 = sub <2 x i16> %a, %b
+  %sub1 = sub <2 x i16> %b, %a
+  %ret0 = select <2 x i1> %icmp0, <2 x i16> %sub0, <2 x i16> %sub1
 
-  %ret = add i16 %ret0, %c
+  %ret = add <2 x i16> %ret0, %c
 
-  store i16 %ret, i16 addrspace(1)* %out
+  store <2 x i16> %ret, <2 x i16> addrspace(1)* %out
   ret void
 }
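
As an illustration of the rewrite the pass performs (a minimal sketch mirroring
the promote_uniform_add_i16 test above, with hand-numbered temporaries), a
uniform 16 bit operation such as

  %r = add i16 %a, %b

is replaced by promoteUniformI16OpToI32Op with

  %0 = zext i16 %a to i32
  %1 = zext i16 %b to i32
  %2 = add i32 %0, %1
  %3 = trunc i32 %2 to i16

and every use of %r is rewired to %3. The signed cases (ashr, sdiv, srem, and
selects whose condition is a signed icmp) use sext instead of zext, since only
a sign extended operand makes the 32 bit operation agree with the original 16
bit one in the low 16 bits.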