diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -50,6 +50,7 @@
 private:
   bool optimizeZExt(ZExtInst *I);
+  bool optimizeAndExt(BinaryOperator *BO);
 };
 
 } // end anonymous namespace
@@ -83,6 +84,57 @@
   return false;
 }
 
+// Try to optimize (i64 (and (zext/sext (i32 X)), C1)) if C1 has bit 31 set,
+// but bits 63:32 are zero. If we can prove that bit 31 of X is 0, we can fill
+// the upper 32 bits with ones. A separate transform will turn (zext X) into
+// (sext X) for the same condition.
+bool RISCVCodeGenPrepare::optimizeAndExt(BinaryOperator *BO) {
+  if (!ST->is64Bit())
+    return false;
+
+  if (BO->getOpcode() != Instruction::And)
+    return false;
+
+  if (!BO->getType()->isIntegerTy(64))
+    return false;
+
+  // Left hand side should be sext or zext.
+  Instruction *LHS = dyn_cast<Instruction>(BO->getOperand(0));
+  if (!LHS || (!isa<SExtInst>(LHS) && !isa<ZExtInst>(LHS)))
+    return false;
+
+  Value *LHSSrc = LHS->getOperand(0);
+  if (!LHSSrc->getType()->isIntegerTy(32))
+    return false;
+
+  // Right hand side should be a constant.
+  Value *RHS = BO->getOperand(1);
+
+  auto *CI = dyn_cast<ConstantInt>(RHS);
+  if (!CI)
+    return false;
+  uint64_t C = CI->getZExtValue();
+
+  // Look for constants that fit in 32 bits but not simm12, and can be made
+  // into simm12 by sign extending bit 31. This will allow use of ANDI.
+  // TODO: Is it worth making simm32?
+  if (!isUInt<32>(C) || isInt<12>(C) || !isInt<12>(SignExtend64<32>(C)))
+    return false;
+
+  // If we can determine the sign bit of the input is 0, we can replace the
+  // And mask constant.
+  if (!isImpliedByDomCondition(ICmpInst::ICMP_SGE, LHSSrc,
+                               Constant::getNullValue(LHSSrc->getType()),
+                               LHS, *DL))
+    return false;
+
+  // Sign extend the constant and replace the And operand.
+  C = SignExtend64<32>(C);
+  BO->setOperand(1, ConstantInt::get(LHS->getType(), C));
+
+  return true;
+}
+
 bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -98,6 +150,8 @@
     for (Instruction &I : llvm::make_early_inc_range(BB)) {
       if (auto *ZExt = dyn_cast<ZExtInst>(&I))
        MadeChange |= optimizeZExt(ZExt);
+      else if (I.getOpcode() == Instruction::And)
+        MadeChange |= optimizeAndExt(cast<BinaryOperator>(&I));
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -166,7 +166,6 @@
   }
 
   void addIRPasses() override;
-  void addCodeGenPrepare() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addIRTranslator() override;
@@ -192,13 +191,10 @@
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVGatherScatterLoweringPass());
 
-  TargetPassConfig::addIRPasses();
-}
-
-void RISCVPassConfig::addCodeGenPrepare() {
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVCodeGenPreparePass());
-  TargetPassConfig::addCodeGenPrepare();
+
+  TargetPassConfig::addIRPasses();
 }
 
 bool RISCVPassConfig::addPreISel() {
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -26,6 +26,7 @@
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: RISCV gather/scatter lowering
+; CHECK-NEXT: RISCV CodeGenPrepare
 ; CHECK-NEXT: Module Verifier
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Canonicalize natural loops
@@ -57,7 +58,6 @@
 ; CHECK-NEXT: Expand reduction intrinsics
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: TLS Variable Hoist
-; CHECK-NEXT: RISCV CodeGenPrepare
 ; CHECK-NEXT: CodeGen Prepare
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -40,3 +40,88 @@
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+; Make sure we convert the 4294967294 in for.body.preheader.new to -2 based on
+; the upper 33 bits being zero by the dominating condition %cmp3.
+define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: blez a1, .LBB1_7
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: li a3, 1
+; CHECK-NEXT: andi a2, a1, 1
+; CHECK-NEXT: bne a1, a3, .LBB1_3
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: j .LBB1_5
+; CHECK-NEXT: .LBB1_3: # %for.body.preheader.new
+; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: andi a1, a1, -2
+; CHECK-NEXT: addi a4, a0, 4
+; CHECK-NEXT: .LBB1_4: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lw a5, -4(a4)
+; CHECK-NEXT: lw a6, 0(a4)
+; CHECK-NEXT: addiw a5, a5, 4
+; CHECK-NEXT: sw a5, -4(a4)
+; CHECK-NEXT: addiw a5, a6, 4
+; CHECK-NEXT: sw a5, 0(a4)
+; CHECK-NEXT: addi a3, a3, 2
+; CHECK-NEXT: addi a4, a4, 8
+; CHECK-NEXT: bne a1, a3, .LBB1_4
+; CHECK-NEXT: .LBB1_5: # %for.cond.cleanup.loopexit.unr-lcssa
+; CHECK-NEXT: beqz a2, .LBB1_7
+; CHECK-NEXT: # %bb.6: # %for.body.epil
+; CHECK-NEXT: slli a1, a3, 2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: lw a1, 0(a0)
+; CHECK-NEXT: addiw a1, a1, 4
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: .LBB1_7: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+  %cmp3 = icmp sgt i32 %n, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %xtraiter = and i64 %wide.trip.count, 1
+  %0 = icmp eq i32 %n, 1
+  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
+
+for.body.preheader.new:                           ; preds = %for.body.preheader
+  %unroll_iter = and i64 %wide.trip.count, 4294967294
+  br label %for.body
+
+for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
+  %indvars.iv.unr = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next.1, %for.body ]
+  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
+  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
+
+for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa
+  %arrayidx.epil = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.unr
+  %1 = load i32, ptr %arrayidx.epil, align 4
+  %add.epil = add nsw i32 %1, 4
+  store i32 %add.epil, ptr %arrayidx.epil, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.new
+  %indvars.iv = phi i64 [ 0, %for.body.preheader.new ], [ %indvars.iv.next.1, %for.body ]
+  %niter = phi i64 [ 0, %for.body.preheader.new ], [ %niter.next.1, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %2, 4
+  store i32 %add, ptr %arrayidx, align 4
+  %indvars.iv.next = or i64 %indvars.iv, 1
+  %arrayidx.1 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.next
+  %3 = load i32, ptr %arrayidx.1, align 4
+  %add.1 = add nsw i32 %3, 4
+  store i32 %add.1, ptr %arrayidx.1, align 4
+  %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
+  %niter.next.1 = add i64 %niter, 2
+  %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
+  br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
+}
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
@@ -51,3 +51,94 @@
   %exitcond.not = icmp eq i64 %lsr.iv.next, 0
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
+
+; Make sure we convert the 4294967294 in for.body.preheader.new to -2 based on
+; the upper 33 bits being zero by the dominating condition %cmp3.
+define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1
+; CHECK-NEXT: br i1 [[TMP0]], label [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
+; CHECK: for.body.preheader.new:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], -2
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit.unr-lcssa:
+; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_EPIL:%.*]]
+; CHECK: for.body.epil:
+; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV_UNR]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4
+; CHECK-NEXT: [[ADD_EPIL:%.*]] = add nsw i32 [[TMP1]], 4
+; CHECK-NEXT: store i32 [[ADD_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 4
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP3]], 4
+; CHECK-NEXT: store i32 [[ADD_1]], ptr [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY]]
+;
+entry:
+  %cmp3 = icmp sgt i32 %n, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %xtraiter = and i64 %wide.trip.count, 1
+  %0 = icmp eq i32 %n, 1
+  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
+
+for.body.preheader.new:                           ; preds = %for.body.preheader
+  %unroll_iter = and i64 %wide.trip.count, 4294967294
+  br label %for.body
+
+for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
+  %indvars.iv.unr = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next.1, %for.body ]
+  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
+  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
+
+for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa
+  %arrayidx.epil = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.unr
+  %1 = load i32, ptr %arrayidx.epil, align 4
+  %add.epil = add nsw i32 %1, 4
+  store i32 %add.epil, ptr %arrayidx.epil, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.new
+  %indvars.iv = phi i64 [ 0, %for.body.preheader.new ], [ %indvars.iv.next.1, %for.body ]
+  %niter = phi i64 [ 0, %for.body.preheader.new ], [ %niter.next.1, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %2, 4
+  store i32 %add, ptr %arrayidx, align 4
+  %indvars.iv.next = or i64 %indvars.iv, 1
+  %arrayidx.1 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.next
+  %3 = load i32, ptr %arrayidx.1, align 4
+  %add.1 = add nsw i32 %3, 4
+  store i32 %add.1, ptr %arrayidx.1, align 4
+  %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
+  %niter.next.1 = add i64 %niter, 2
+  %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
+  br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
+}
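
Illustrative note (not part of the patch): a minimal sketch, in the spirit of @test2 above, of the IR shape that optimizeAndExt targets. The function name @sketch, the value %x, and the branch structure are hypothetical and chosen only for illustration; the dominating signed comparison is what lets isImpliedByDomCondition prove that bit 31 of the 32-bit input is zero.

; Hypothetical input, reduced to the bare pattern.
define i64 @sketch(i32 signext %x) {
entry:
  ; Dominating condition: %x is known positive in %pos.
  %nonneg = icmp sgt i32 %x, 0
  br i1 %nonneg, label %pos, label %done

pos:
  ; 4294967294 (0xFFFFFFFE) has bit 31 set, fits in 32 bits, but not in simm12.
  %ext = zext i32 %x to i64
  %masked = and i64 %ext, 4294967294
  ret i64 %masked

done:
  ret i64 0
}

Together with the existing zext-to-sext transform in optimizeZExt, the pass is expected to rewrite %ext to a sext and the mask to -2. Since -2 is a valid simm12, instruction selection can then use a single ANDI instead of materializing the 32-bit constant, which is the same effect the new tests check for the unroll count in for.body.preheader.new.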