Index: lib/Target/ARM/ARMCodeGenPrepare.cpp =================================================================== --- lib/Target/ARM/ARMCodeGenPrepare.cpp +++ lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -247,41 +247,108 @@ if (isa(I) && I->hasNoUnsignedWrap()) return true; + // We can support a, potentially, overflowing instruction (I) if: + // - It is only used by an unsigned icmp. + // - The icmp uses a constant. + // - The overflowing value (I) is decreasing, i.e. would underflow - wrapping + // around zero to become a larger number than before. + // - The underflowing instruction (I) also uses a constant. + // + // We can then use the two constants to calculate whether the result would + // wrap in respect to itself in the original bitwidth. If it doesn't wrap, + // just underflows the range, the icmp would give the same result whether the + // result has been truncated or not. We calculate this by: + // - Zero extending both constants, if needed, to 32-bits. + // - Take the absolute value of I's constant, adding this to the icmp const. + // - Check that this value is not out of range for small type. If it is, it + // means that it has underflowed enough to wrap around the icmp constant. + // + // For example: + // + // %sub = sub i8 %a, 2 + // %cmp = icmp ule i8 %sub, 254 + // + // If %a = 0, %sub = -2 == FE == 254 + // But if this is evaluated as an i32 + // %sub = -2 == FF FF FF FE == 4294967294 + // So the unsigned compares (i8 and i32) would not yield the same result. 
+ // + // Whereas: + // + // %sub = sub i8 %a, 1 + // %cmp = icmp ule i8 %sub, 254 + // + // If %a = 0, %sub = -1 == FF == 255 + // As i32: + // %sub = -1 == FF FF FF FF == 4294967295 + // + // In this case, the unsigned compare results would be the same and this + // would also be true for ult, uge and ugt: + // - (255 < 254) == (0xFFFFFFFF < 254) == false + // - (255 <= 254) == (0xFFFFFFFF <= 254) == false + // - (255 > 254) == (0xFFFFFFFF > 254) == true + // - (255 >= 254) == (0xFFFFFFFF >= 254) == true + // + // To demonstrate why we can't handle increasing values: + // + // %add = add i8 %a, 2 + // %cmp = icmp ult i8 %add, 127 + // + // If %a = 254, %add = 256 == (i8 0) + // As i32: + // %add = 256 + // + // (0 < 127) != (256 < 127) + unsigned Opc = I->getOpcode(); - if (Opc == Instruction::Add || Opc == Instruction::Sub) { - // We don't care if the add or sub could wrap if the value is decreasing - // and is only being used by an unsigned compare. - if (!I->hasOneUse() || - !isa(*I->user_begin()) || - !isa(I->getOperand(1))) - return false; + if (Opc != Instruction::Add && Opc != Instruction::Sub) + return false; - auto *CI = cast(*I->user_begin()); - - // Don't support an icmp that deals with sign bits, including negative - // immediates - if (CI->isSigned()) - return false; + if (!I->hasOneUse() || + !isa(*I->user_begin()) || + !isa(I->getOperand(1))) + return false; - if (auto *Const = dyn_cast(CI->getOperand(0))) - if (Const->isNegative()) - return false; + ConstantInt *OverflowConst = cast(I->getOperand(1)); + bool NegImm = OverflowConst->isNegative(); + bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) || + ((Opc == Instruction::Add) && NegImm); + if (!IsDecreasing) + return false; - if (auto *Const = dyn_cast(CI->getOperand(1))) - if (Const->isNegative()) - return false; + // Don't support an icmp that deals with sign bits. 
+ auto *CI = cast(*I->user_begin()); + if (CI->isSigned() || CI->isEquality()) + return false; - bool NegImm = cast(I->getOperand(1))->isNegative(); - bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) || - ((Opc == Instruction::Add) && NegImm); - if (!IsDecreasing) - return false; + ConstantInt *ICmpConst = nullptr; + if (auto *Const = dyn_cast(CI->getOperand(0))) + ICmpConst = Const; + else if (auto *Const = dyn_cast(CI->getOperand(1))) + ICmpConst = Const; + else + return false; - LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); - return true; - } + // Now check that the result can't wrap on itself. + APInt Total = ICmpConst->getValue().getBitWidth() < 32 ? + ICmpConst->getValue().zext(32) : ICmpConst->getValue(); - return false; + Total += OverflowConst->getValue().getBitWidth() < 32 ? + OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs(); + + APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize); + + if (Total.getBitWidth() > Max.getBitWidth()) { + if (Total.ugt(Max.zext(Total.getBitWidth()))) + return false; + } else if (Max.getBitWidth() > Total.getBitWidth()) { + if (Total.zext(Max.getBitWidth()).ugt(Max)) + return false; + } else if (Total.ugt(Max)) + return false; + + LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); + return true; } static bool shouldPromote(Value *V) { @@ -459,6 +526,8 @@ if (!shouldPromote(V) || isPromotedResultSafe(V)) continue; + assert(EnableDSP && "DSP intrinisc insertion not enabled!"); + // Replace unsafe instructions with appropriate intrinsic calls. 
InsertDSPIntrinsic(cast(V)); } Index: test/CodeGen/ARM/arm-cgp-overflow.ll =================================================================== --- test/CodeGen/ARM/arm-cgp-overflow.ll +++ test/CodeGen/ARM/arm-cgp-overflow.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s ; CHECK: overflow_add ; CHECK: add @@ -47,3 +47,134 @@ %res = select i1 %cmp, i16 2, i16 5 ret i16 %res } + +; CHECK-LABEL: overflow_add_no_consts: +; CHECK: add r0, r1 +; CHECK: uxtb [[EXT:r[0-9]+]], r0 +; CHECK: cmp [[EXT]], r2 +; CHECK: movhi r0, #8 +define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) { + %add = add i8 %a, %b + %cmp = icmp ugt i8 %add, %limit + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: overflow_add_const_limit: +; CHECK: add r0, r1 +; CHECK: uxtb [[EXT:r[0-9]+]], r0 +; CHECK: cmp [[EXT]], #128 +; CHECK: movhi r0, #8 +define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) { + %add = add i8 %a, %b + %cmp = icmp ugt i8 %add, 128 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: overflow_add_positive_const_limit: +; CHECK: adds r0, #1 +; CHECK: uxtb [[EXT:r[0-9]+]], r0 +; CHECK: cmp [[EXT]], #128 +; CHECK: movhi r0, #8 +define i32 @overflow_add_positive_const_limit(i8 zeroext %a) { + %add = add i8 %a, 1 + %cmp = icmp ugt i8 %add, 128 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: unsafe_add_underflow: +; CHECK: subs r0, #2 +; CHECK: uxtb [[EXT:r[0-9]+]], r0 +; CHECK: cmp [[EXT]], #255 +; CHECK: moveq r0, #8 +define i32 @unsafe_add_underflow(i8 zeroext %a) { + %add = add i8 %a, -2 + %cmp = icmp ugt i8 %add, 254 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: safe_add_underflow: +; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #1 +; CHECK-NOT: uxtb +; CHECK: cmp [[MINUS_1]], #254 
+; CHECK: movhi r0, #8 +define i32 @safe_add_underflow(i8 zeroext %a) { + %add = add i8 %a, -1 + %cmp = icmp ugt i8 %add, 254 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: safe_add_underflow_neg: +; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #2 +; CHECK-NOT: uxtb +; CHECK: cmp [[MINUS_1]], #251 +; CHECK: movlo r0, #8 +define i32 @safe_add_underflow_neg(i8 zeroext %a) { + %add = add i8 %a, -2 + %cmp = icmp ule i8 %add, -6 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: overflow_sub_negative_const_limit: +; CHECK: adds r0, #1 +; CHECK: uxtb [[EXT:r[0-9]+]], r0 +; CHECK: cmp [[EXT]], #128 +; CHECK: movhi r0, #8 +define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) { + %sub = sub i8 %a, -1 + %cmp = icmp ugt i8 %sub, 128 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: unsafe_sub_underflow: +; CHECK: subs r0, #6 +; CHECK: uxtb [[EXT:r[0-9]+]], r0 +; CHECK: cmp [[EXT]], #250 +; CHECK: movhi r0, #8 +define i32 @unsafe_sub_underflow(i8 zeroext %a) { + %sub = sub i8 %a, 6 + %cmp = icmp ugt i8 %sub, 250 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: safe_sub_underflow: +; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #1 +; CHECK-NOT: uxtb +; CHECK: cmp [[MINUS_1]], #255 +; CHECK: movlo r0, #8 +define i32 @safe_sub_underflow(i8 zeroext %a) { + %sub = sub i8 %a, 1 + %cmp = icmp ule i8 %sub, 254 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK-LABEL: safe_sub_underflow_neg +; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #4 +; CHECK-NOT: uxtb +; CHECK: cmp [[MINUS_1]], #250 +; CHECK: movhi r0, #8 +define i32 @safe_sub_underflow_neg(i8 zeroext %a) { + %sub = sub i8 %a, 4 + %cmp = icmp uge i8 %sub, -5 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +; CHECK: subs r0, #4 +; CHECK: uxtb [[EXT:r[0-9]+]], r0 +; CHECK: cmp [[EXT]], #253 +; CHECK: movlo r0, #8 +define i32 @unsafe_sub_underflow_neg(i8 zeroext %a) { + %sub = sub i8 %a, 4 + %cmp = icmp ult i8 %sub, 
-3 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} Index: test/CodeGen/ARM/arm-cgp-signed-icmps.ll =================================================================== --- test/CodeGen/ARM/arm-cgp-signed-icmps.ll +++ test/CodeGen/ARM/arm-cgp-signed-icmps.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-disable-cgp=false -mattr=-use-misched %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 -arm-disable-cgp=false -mattr=-use-misched %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP ; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP ; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM @@ -47,13 +47,22 @@ ; CHECK-NODSP: cmp ; CHECK-NODSP: cmp -; CHECK-DSP: sxth [[ARG:r[0-9]+]], r2 -; CHECK-DSP: subs [[SUB:r[0-9]+]], -; CHECK-DSP: uadd16 [[ADD:r[0-9]+]], -; CHECK-DSP: sxth.w [[SEXT:r[0-9]+]], [[ADD]] -; CHECK-DSP: cmp [[SEXT]], [[ARG]] -; CHECK-DSP-NOT: uxt -; CHECK-DSP: cmp [[SUB]], r2 +; CHECK-DSP: sub +; CHECK-DSP: sxth +; CHECK-DSP: add +; CHECK-DSP: uxth +; CHECK-DSP: sxth +; CHECK-DSP: cmp +; CHECK-DSP: cmp + +; CHECK-DSP-IMM: sxth [[ARG:r[0-9]+]], r2 +; CHECK-DSP-IMM: uadd16 [[ADD:r[0-9]+]], +; CHECK-DSP-IMM: sxth.w [[SEXT:r[0-9]+]], [[ADD]] +; CHECK-DSP-IMM: cmp [[SEXT]], [[ARG]] +; CHECK-DSP-IMM-NOT: uxt +; CHECK-DSP-IMM: movs [[ONE:r[0-9]+]], #1 +; CHECK-DSP-IMM: usub16 [[SUB:r[0-9]+]], r1, [[ONE]] +; CHECK-DSP-IMM: cmp [[SUB]], r2 define i16 @ugt_slt(i16 *%x, i16 zeroext %y, i16 zeroext %z) { entry: %load0 = load i16, i16* %x, align 1 Index: test/CodeGen/ARM/pr39060.ll =================================================================== --- /dev/null +++ 
test/CodeGen/ARM/pr39060.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=armv7a-linux-androideabi %s -o - | FileCheck %s + +@a = local_unnamed_addr global i16 -1, align 2 +@b = local_unnamed_addr global i16 0, align 2 + +; CHECK-LABEL: pr39060: +; CHECK: ldrh +; CHECK: ldrh +; CHECK: sub +; CHECK: uxth +define void @pr39060() local_unnamed_addr #0 { +entry: + %0 = load i16, i16* @a, align 2 + %1 = load i16, i16* @b, align 2 + %sub = add i16 %1, -1 + %cmp = icmp eq i16 %0, %sub + br i1 %cmp, label %if.else, label %if.then + +if.then: + tail call void bitcast (void (...)* @f to void ()*)() #2 + br label %if.end + +if.else: + tail call void bitcast (void (...)* @g to void ()*)() #2 + br label %if.end + +if.end: + ret void +} + +declare void @f(...) local_unnamed_addr #1 + +declare void @g(...) local_unnamed_addr #1