Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
@@ -131,6 +132,15 @@
   ///
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
+  /// Expands 24 bit div or rem.
+  Value* expandDivRem24(IRBuilder<> &Builder, Value *Num, Value *Den,
+                        bool IsDiv, bool IsSigned) const;
+
+  /// Expands 32 bit div or rem.
+  Value* expandDivRem32(IRBuilder<> &Builder, Instruction::BinaryOps Opc,
+                        Value *Num, Value *Den) const;
+
   /// Widen a scalar load.
   ///
   /// \details \p Widen scalar load for uniform, small type loads from constant
@@ -256,7 +266,9 @@
          "I does not need promotion to i32");
 
   if (I.getOpcode() == Instruction::SDiv ||
-      I.getOpcode() == Instruction::UDiv)
+      I.getOpcode() == Instruction::UDiv ||
+      I.getOpcode() == Instruction::SRem ||
+      I.getOpcode() == Instruction::URem)
     return false;
 
   IRBuilder<> Builder(&I);
@@ -467,12 +479,314 @@
   return Attr.getValueAsString() == "true";
 }
 
+static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
+                                          Value *LHS, Value *RHS) {
+  Type *I32Ty = Builder.getInt32Ty();
+  Type *I64Ty = Builder.getInt64Ty();
+  ConstantInt *Zero = Builder.getInt32(0);
+  ConstantInt *One = Builder.getInt32(1);
+
+  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
+  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
+  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
+  Value *MUL64_HILO = Builder.CreateBitCast(MUL64, VectorType::get(I32Ty, 2));
+  return std::make_pair(Builder.CreateExtractElement(MUL64_HILO, Zero),
+                        Builder.CreateExtractElement(MUL64_HILO, One));
+}
+
+static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
+  return getMul64(Builder, LHS, RHS).second;
+}
+
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+                                            Value *Num, Value *Den,
+                                            bool IsDiv, bool IsSigned) const {
+  assert(Num->getType()->isIntegerTy(32));
+
+  const DataLayout &DL = Mod->getDataLayout();
+  unsigned LHSSignBits = ComputeNumSignBits(Num, DL);
+  if (LHSSignBits < 9)
+    return nullptr;
+
+  unsigned RHSSignBits = ComputeNumSignBits(Den, DL);
+  if (RHSSignBits < 9)
+    return nullptr;
+
+  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+  unsigned DivBits = 32 - SignBits;
+  if (IsSigned)
+    ++DivBits;
+
+  Type *Ty = Num->getType();
+  Type *I32Ty = Builder.getInt32Ty();
+  Type *F32Ty = Builder.getFloatTy();
+  ConstantInt *One = Builder.getInt32(1);
+  Value *JQ = One;
+
+  if (IsSigned) {
+    // char|short jq = ia ^ ib;
+    JQ = Builder.CreateXor(Num, Den);
+
+    // jq = jq >> (bitsize - 2)
+    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
+
+    // jq = jq | 0x1
+    JQ = Builder.CreateOr(JQ, One);
+  }
+
+  // int ia = (int)LHS;
+  Value *IA = Num;
+
+  // int ib = (int)RHS;
+  Value *IB = Den;
+
+  // float fa = (float)ia;
+  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
+                       : Builder.CreateUIToFP(IA, F32Ty);
+
+  // float fb = (float)ib;
+  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
+                       : Builder.CreateUIToFP(IB, F32Ty);
+
+  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+  Value *FQ = Builder.CreateFMul(FA, RCP);
+
+  // fq = trunc(fq);
+  Function *Trunc = Intrinsic::getDeclaration(Mod, Intrinsic::trunc, { F32Ty });
+  FQ = Builder.CreateCall(Trunc, { FQ });
+
+  // float fqneg = -fq;
+  Value *FQNeg = Builder.CreateFNeg(FQ);
+
+  // float fr = mad(fqneg, fb, fa);
+  Function *MAD = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fmad_ftz);
+  Value *FR = Builder.CreateCall(MAD, { FQNeg, FB, FA });
+
+  // int iq = (int)fq;
+  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
+                       : Builder.CreateFPToUI(FQ, I32Ty);
+
+  // fr = fabs(fr);
+  Function *FAbs = Intrinsic::getDeclaration(Mod, Intrinsic::fabs, { F32Ty });
+  FR = Builder.CreateCall(FAbs, { FR });
+
+  // fb = fabs(fb);
+  FB = Builder.CreateCall(FAbs, { FB });
+
+  // int cv = fr >= fb;
+  Value *CV = Builder.CreateFCmpOGE(FR, FB);
+
+  // jq = (cv ? jq : 0);
+  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
+
+  // dst = iq + jq;
+  Value *Div = Builder.CreateAdd(IQ, JQ);
+
+  Value *Res = Div;
+  if (!IsDiv) {
+    // Rem needs compensation, it's easier to recompute it
+    Value *Rem = Builder.CreateMul(Div, Den);
+    Res = Builder.CreateSub(Num, Rem);
+  }
+
+  // Truncate to number of bits this divide really is.
+  if (IsSigned) {
+    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
+    Res = Builder.CreateSExt(Res, Ty);
+  } else {
+    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+    Res = Builder.CreateAnd(Res, TruncMask);
+  }
+
+  return Res;
+}
+
+Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+                                            Instruction::BinaryOps Opc,
+                                            Value *Num, Value *Den) const {
+  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
+         Opc == Instruction::SRem || Opc == Instruction::SDiv);
+
+  FastMathFlags FMF;
+  FMF.setFast();
+  Builder.setFastMathFlags(FMF);
+
+  if (isa<Constant>(Den))
+    return nullptr; // Keep it for optimization
+
+  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
+  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
+
+  Type *Ty = Num->getType();
+  Type *I32Ty = Builder.getInt32Ty();
+  Type *F32Ty = Builder.getFloatTy();
+
+  if (Ty->getScalarSizeInBits() < 32) {
+    if (IsSigned) {
+      Num = Builder.CreateSExt(Num, I32Ty);
+      Den = Builder.CreateSExt(Den, I32Ty);
+    } else {
+      Num = Builder.CreateZExt(Num, I32Ty);
+      Den = Builder.CreateZExt(Den, I32Ty);
+    }
+  }
+
+  if (Value *Res = expandDivRem24(Builder, Num, Den, IsDiv, IsSigned)) {
+    Res = Builder.CreateTrunc(Res, Ty);
+    return Res;
+  }
+
+  ConstantInt *Zero = Builder.getInt32(0);
+  ConstantInt *One = Builder.getInt32(1);
+  ConstantInt *MinusOne = Builder.getInt32(~0);
+
+  Value *Sign = nullptr;
+  if (IsSigned) {
+    ConstantInt *K31 = Builder.getInt32(31);
+    Value *LHSign = Builder.CreateAShr(Num, K31);
+    Value *RHSign = Builder.CreateAShr(Den, K31);
+    // Remainder sign is the same as LHS
+    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+
+    Num = Builder.CreateAdd(Num, LHSign);
+    Den = Builder.CreateAdd(Den, RHSign);
+
+    Num = Builder.CreateXor(Num, LHSign);
+    Den = Builder.CreateXor(Den, RHSign);
+  }
+
+  // RCP = URECIP(Den) = 2^32 / Den + e
+  // e is rounding error.
+  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
+  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
+  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
+  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
+  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
+
+  // RCP_LO, RCP_HI = mul(RCP, Den)
+  Value *RCP_LO, *RCP_HI;
+  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
+
+  // NEG_RCP_LO = -RCP_LO
+  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
+
+  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
+  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
+
+  // Calculate the rounding error from the URECIP instruction
+  // E = mulhu(ABS_RCP_LO, RCP)
+  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
+
+  // RCP_A_E = RCP + E
+  Value *RCP_A_E = Builder.CreateAdd(RCP, E);
+
+  // RCP_S_E = RCP - E
+  Value *RCP_S_E = Builder.CreateSub(RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
+
+  // Quotient = mulhu(Tmp0, Num)
+  Value *Quotient = getMulHu(Builder, Tmp0, Num);
+
+  // Num_S_Remainder = Quotient * Den
+  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+
+  // Remainder = Num - Num_S_Remainder
+  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+
+  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
+  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+
+  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
+  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
+                                                  MinusOne, Zero);
+
+  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
+  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+
+  Value *Res;
+  if (IsDiv) {
+    // Quotient_A_One = Quotient + 1
+    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+
+    // Quotient_S_One = Quotient - 1
+    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+
+    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+
+    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
+  } else {
+    // Remainder_S_Den = Remainder - Den
+    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+
+    // Remainder_A_Den = Remainder + Den
+    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+
+    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+
+    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+  }
+
+  if (IsSigned) {
+    Res = Builder.CreateXor(Res, Sign);
+    Res = Builder.CreateSub(Res, Sign);
+  }
+
+  Res = Builder.CreateTrunc(Res, Ty);
+
+  return Res;
+}
+
 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+      DA->isUniform(&I) && promoteUniformOpToI32(I))
+    return true;
+
   bool Changed = false;
+  Instruction::BinaryOps Opc = I.getOpcode();
+  Type *Ty = I.getType();
+  Value *NewDiv = nullptr;
+  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
+       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
+      Ty->getScalarSizeInBits() <= 32) {
+    Value *Num = I.getOperand(0);
+    Value *Den = I.getOperand(1);
+    IRBuilder<> Builder(&I);
+    Builder.SetCurrentDebugLocation(I.getDebugLoc());
 
-  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
-      DA->isUniform(&I))
-    Changed |= promoteUniformOpToI32(I);
+    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+      NewDiv = UndefValue::get(VT);
+
+      for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+        Value *NumEltI = Builder.CreateExtractElement(Num, I);
+        Value *DenEltI = Builder.CreateExtractElement(Den, I);
+        Value *NewElt = expandDivRem32(Builder, Opc, NumEltI, DenEltI);
+        if (!NewElt)
+          NewElt = Builder.CreateBinOp(Opc, NumEltI, DenEltI);
+        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, I);
+      }
+    } else {
+      NewDiv = expandDivRem32(Builder, Opc, Num, Den);
+    }
+
+    if (NewDiv) {
+      I.replaceAllUsesWith(NewDiv);
+      I.eraseFromParent();
+      Changed = true;
+    }
+  }
 
   return Changed;
 }
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -170,13 +170,8 @@
 }
 
 ; GCN-LABEL: @urem_i3(
-; SI: %r = urem i3 %a, %b
-; SI-NEXT: store volatile i3 %r
-; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
-; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: store volatile i3 %[[R_3]]
+; GCN: uitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @urem_i3(i3 %a, i3 %b) {
   %r = urem i3 %a, %b
   store volatile i3 %r, i3 addrspace(1)* undef
@@ -184,13 +179,8 @@
 }
 
 ; GCN-LABEL: @srem_i3(
-; SI: %r = srem i3 %a, %b
-; SI-NEXT: store volatile i3 %r
-; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32
-; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: store volatile i3 %[[R_3]]
+; GCN: sitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @srem_i3(i3 %a, i3 %b) {
   %r = srem i3 %a, %b
   store volatile i3 %r, i3 addrspace(1)* undef
@@ -749,13 +739,8 @@
 }
 
 ; GCN-LABEL: @urem_i16(
-; SI: %r = urem i16 %a, %b
-; SI-NEXT: store volatile i16 %r
-; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
-; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: store volatile i16 %[[R_16]]
+; GCN: uitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @urem_i16(i16 %a, i16 %b) {
   %r = urem i16 %a, %b
   store volatile i16 %r, i16 addrspace(1)* undef
@@ -763,13 +748,8 @@
 }
 
 ; GCN-LABEL: @srem_i16(
-; SI: %r = srem i16 %a, %b
-; SI-NEXT: store volatile i16 %r
-; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32
-; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: store volatile i16 %[[R_16]]
+; GCN: sitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @srem_i16(i16 %a, i16 %b) {
   %r = srem i16 %a, %b
   store volatile i16 %r, i16 addrspace(1)* undef
@@ -1313,13 +1293,10 @@
 }
 
 ; GCN-LABEL: @urem_3xi15(
-; SI: %r = urem <3 x i15> %a, %b
-; SI-NEXT: store volatile <3 x i15> %r
-; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
-; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+; GCN: uitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
+; GCN: fdiv fast float 1.000000e+00,
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @urem_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = urem <3 x i15> %a, %b
   store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
@@ -1327,13 +1304,10 @@
 }
 
 ; GCN-LABEL: @srem_3xi15(
-; SI: %r = srem <3 x i15> %a, %b
-; SI-NEXT: store volatile <3 x i15> %r
-; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
-; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+; GCN: sitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
+; GCN: fdiv fast float 1.000000e+00,
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @srem_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = srem <3 x i15> %a, %b
   store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
@@ -1868,13 +1842,10 @@
 }
 
 ; GCN-LABEL: @urem_3xi16(
-; SI: %r = urem <3 x i16> %a, %b
-; SI-NEXT: store volatile <3 x i16> %r
-; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
-; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+; GCN: uitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
+; GCN: fdiv fast float 1.000000e+00,
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @urem_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = urem <3 x i16> %a, %b
   store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
@@ -1882,13 +1853,8 @@
 }
 
 ; GCN-LABEL: @srem_3xi16(
-; SI: %r = srem <3 x i16> %a, %b
-; SI-NEXT: store volatile <3 x i16> %r
-; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
-; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]]
-; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+; GCN: sitofp i32 %{{[^ ]+}} to float
+; GCN: fdiv fast float 1.000000e+00,
 define amdgpu_kernel void @srem_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = srem <3 x i16> %a, %b
   store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
Index: test/CodeGen/AMDGPU/dagcombine-select.ll
===================================================================
--- test/CodeGen/AMDGPU/dagcombine-select.ll
+++ test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -157,37 +157,37 @@
 
 ; GCN-LABEL: {{^}}sdiv_constant_sel_constants:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 5, 0,
-define amdgpu_kernel void @sdiv_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
-  %sel = select i1 %cond, i32 121, i32 23
-  %bo = sdiv i32 120, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+define amdgpu_kernel void @sdiv_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
+  %sel = select i1 %cond, i64 121, i64 23
+  %bo = sdiv i64 120, %sel
+  store i64 %bo, i64 addrspace(1)* %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}udiv_constant_sel_constants:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 5, 0,
-define amdgpu_kernel void @udiv_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
-  %sel = select i1 %cond, i32 -4, i32 23
-  %bo = udiv i32 120, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+define amdgpu_kernel void @udiv_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
+  %sel = select i1 %cond, i64 -4, i64 23
+  %bo = udiv i64 120, %sel
+  store i64 %bo, i64 addrspace(1)* %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}srem_constant_sel_constants:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 3, 33,
-define amdgpu_kernel void @srem_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
-  %sel = select i1 %cond, i32 34, i32 15
-  %bo = srem i32 33, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+define amdgpu_kernel void @srem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
+  %sel = select i1 %cond, i64 34, i64 15
+  %bo = srem i64 33, %sel
+  store i64 %bo, i64 addrspace(1)* %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}urem_constant_sel_constants:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 3, 33,
-define amdgpu_kernel void @urem_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
-  %sel = select i1 %cond, i32 34, i32 15
-  %bo = urem i32 33, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+define amdgpu_kernel void @urem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
+  %sel = select i1 %cond, i64 34, i64 15
+  %bo = urem i64 33, %sel
+  store i64 %bo, i64 addrspace(1)* %p, align 8
   ret void
 }
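
For reference, outside the patch itself: the IR that expandDivRem24 builds follows the
pseudocode in its comments. Below is a minimal scalar C++ sketch of the same signed
24-bit trick, assuming arithmetic right shift of negative values (as on the target)
and using std::trunc/std::fma/std::fabs from <cmath> in place of the llvm.trunc,
amdgcn.fmad.ftz and llvm.fabs intrinsics; the helper name div24 is illustrative only
and not part of the patch.

// Scalar sketch of the float-based 24-bit signed division correction.
#include <cmath>
#include <cstdint>

static int32_t div24(int32_t ia, int32_t ib) {
  // jq = ((ia ^ ib) >> 30) | 1 is +1 or -1, matching the sign of the quotient
  // (assumes arithmetic shift for negative values).
  int32_t jq = ((ia ^ ib) >> 30) | 1;

  float fa = static_cast<float>(ia);
  float fb = static_cast<float>(ib);

  // Approximate quotient from the reciprocal, truncated toward zero.
  float fq = std::trunc(fa * (1.0f / fb));

  // Residual fr = fa - fq * fb, i.e. mad(-fq, fb, fa).
  float fr = std::fma(-fq, fb, fa);

  int32_t iq = static_cast<int32_t>(fq);

  // If |fr| >= |fb|, the truncated quotient is off by one; nudge it by jq.
  return (std::fabs(fr) >= std::fabs(fb)) ? iq + jq : iq;
}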