# Review notes. This text sits ahead of the first "Index:" header and is not
# part of any hunk.
#
# Utils.cpp: adds two cases to an opcode switch.
#   * G_BUILD_VECTOR: returns true only when isKnownToBeAPowerOfTwo() holds for
#     every operand from index 1 onward, and returns false at the first operand
#     for which it does not (presumably operand 0 is the result -- confirm).
#   * G_BUILD_VECTOR_TRUNC: returns true only when getConstantVRegVal() yields a
#     value for every operand from index 1 onward and that value, after
#     zextOrTrunc() to Ty.getScalarSizeInBits(), is a power of two.
#
# combine-urem-pow-2.mir:
#   * The expected output for G_UREM by a G_BUILD_VECTOR_TRUNC of the constant 4
#     and by a G_BUILD_VECTOR of 4096/2048 changes from G_UREM to
#     G_AND %var, (divisor + splat(-1)).
#   * New test urem_v2s16_var_nonconst_build_vector_trunc: one element is a
#     G_SHL result rather than a constant, and the G_UREM is expected to stay.
#
# NOTE(review): the G_BUILD_VECTOR case calls isKnownToBeAPowerOfTwo() on each
#   element with no depth bound; the patch's own TODO already flags this.
# NOTE(review): the first test's input now declares %shift_amt as s32 and no
#   instruction shown here uses it, while the new test feeds a <2 x s16>
#   %shift_amt into an s32 G_SHL. Confirm the types are intended -- the s32
#   change may have been meant for the new test.
# NOTE(review): line breaks and indentation in this copy were reconstructed and
#   cross-checked against the hunk line counts; compare with the tree before
#   applying.
Index: llvm/lib/CodeGen/GlobalISel/Utils.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -647,6 +647,28 @@
 
     break;
   }
+  case TargetOpcode::G_BUILD_VECTOR: {
+    // TODO: Probably should have a recursion depth guard since you could have
+    // bitcasted vector elements.
+    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
+      if (!isKnownToBeAPowerOfTwo(MI.getOperand(I).getReg(), MRI, KB))
+        return false;
+    }
+
+    return true;
+  }
+  case TargetOpcode::G_BUILD_VECTOR_TRUNC: {
+    // Only handle constants since we would need to know if number of leading
+    // zeros is greater than the truncation amount.
+    const unsigned BitWidth = Ty.getScalarSizeInBits();
+    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
+      auto Const = getConstantVRegVal(MI.getOperand(I).getReg(), MRI);
+      if (!Const || !Const->zextOrTrunc(BitWidth).isPowerOf2())
+        return false;
+    }
+
+    return true;
+  }
   default:
     break;
   }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
@@ -143,12 +143,44 @@
     ; GCN: %var:_(<2 x s16>) = COPY $vgpr0
     ; GCN: %four:_(s32) = G_CONSTANT i32 4
     ; GCN: %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four(s32), %four(s32)
+    ; GCN: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+    ; GCN: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD %four_vec, [[BUILD_VECTOR]]
+    ; GCN: %rem:_(<2 x s16>) = G_AND %var, [[ADD]]
+    ; GCN: $vgpr0 = COPY %rem(<2 x s16>)
+    %var:_(<2 x s16>) = COPY $vgpr0
+    %shift_amt:_(s32) = COPY $vgpr1
+    %four:_(s32) = G_CONSTANT i32 4
+    %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four, %four
+    %rem:_(<2 x s16>) = G_UREM %var, %four_vec
+    $vgpr0 = COPY %rem
+...
+
+# The shl is a known power of two, but we do not know if the final
+# value is a power of 2 due to the truncation.
+---
+name: urem_v2s16_var_nonconst_build_vector_trunc
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GCN-LABEL: name: urem_v2s16_var_nonconst_build_vector_trunc
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN: %var:_(<2 x s16>) = COPY $vgpr0
+    ; GCN: %shift_amt:_(<2 x s16>) = COPY $vgpr1
+    ; GCN: %two:_(s32) = G_CONSTANT i32 2
+    ; GCN: %four:_(s32) = G_CONSTANT i32 4
+    ; GCN: %shift:_(s32) = G_SHL %two, %shift_amt(<2 x s16>)
+    ; GCN: %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four(s32), %shift(s32)
     ; GCN: %rem:_(<2 x s16>) = G_UREM %var, %four_vec
     ; GCN: $vgpr0 = COPY %rem(<2 x s16>)
     %var:_(<2 x s16>) = COPY $vgpr0
     %shift_amt:_(<2 x s16>) = COPY $vgpr1
+    %two:_(s32) = G_CONSTANT i32 2
     %four:_(s32) = G_CONSTANT i32 4
-    %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four, %four
+    %shift:_(s32) = G_SHL %two, %shift_amt
+    %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four, %shift
     %rem:_(<2 x s16>) = G_UREM %var, %four_vec
     $vgpr0 = COPY %rem
 ...
@@ -190,7 +222,10 @@
     ; GCN: %pow2_1:_(s32) = G_CONSTANT i32 4096
     ; GCN: %pow2_2:_(s32) = G_CONSTANT i32 2048
     ; GCN: %pow2_vec:_(<2 x s32>) = G_BUILD_VECTOR %pow2_1(s32), %pow2_2(s32)
-    ; GCN: %rem:_(<2 x s32>) = G_UREM %var, %pow2_vec
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
+    ; GCN: [[ADD:%[0-9]+]]:_(<2 x s32>) = G_ADD %pow2_vec, [[BUILD_VECTOR]]
+    ; GCN: %rem:_(<2 x s32>) = G_AND %var, [[ADD]]
     ; GCN: $vgpr0_vgpr1 = COPY %rem(<2 x s32>)
     %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %pow2_1:_(s32) = G_CONSTANT i32 4096