Index: lib/Target/ARM/ARMCodeGenPrepare.cpp =================================================================== --- lib/Target/ARM/ARMCodeGenPrepare.cpp +++ lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -85,16 +85,15 @@ const ARMSubtarget *ST = nullptr; IRPromoter *Promoter = nullptr; std::set AllVisited; - Type *OrigTy = nullptr; - unsigned TypeSize = 0; - bool isNarrowInstSupported(Instruction *I); bool isSupportedValue(Value *V); bool isLegalToPromote(Value *V); bool TryToPromote(Value *V); public: static char ID; + static unsigned TypeSize; + Type *OrigTy = nullptr; ARMCodeGenPrepare() : FunctionPass(ID) {} @@ -126,65 +125,66 @@ /// dealing with icmps but allow any other integer that is <= 16 bits. Void /// types are accepted so we can handle switches. static bool isSupportedType(Value *V) { - if (V->getType()->isVoidTy()) + LLVM_DEBUG(dbgs() << "ARM CGP: isSupportedType: " << *V << "\n"); + Type *Ty = V->getType(); + if (Ty->isVoidTy()) return true; - const IntegerType *IntTy = dyn_cast(V->getType()); - if (!IntTy) - return false; + if (auto *Ld = dyn_cast(V)) + Ty = cast(Ld->getPointerOperandType())->getElementType(); - // Don't try to promote boolean values. - if (IntTy->getBitWidth() == 1) + const IntegerType *IntTy = dyn_cast(Ty); + if (!IntTy) { + LLVM_DEBUG(dbgs() << "ARM CGP: No, not an integer.\n"); return false; + } - if (auto *ZExt = dyn_cast(V)) - return isSupportedType(ZExt->getOperand(0)); + return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize; +} - return IntTy->getBitWidth() <= 16; +/// Return true if the given value is a leaf in the use-def chain, producing +/// a narrow (i8, i16) value. These values will be zext to start the promotion +/// of the tree to i32. We guarantee that these won't populate the upper bits +/// of the register. ZExt on the loads will be free, and the same for call +/// return values because we only accept ones that guarantee a zeroext ret val. 
+/// Many arguments will have the zeroext attribute too, so those would be free +/// too. +static bool isSource(Value *V) { + // TODO Allow truncs and zext to be sources. + if (isa(V)) + return true; + else if (isa(V)) + return true; + else if (auto *Call = dyn_cast(V)) + return Call->hasRetAttr(Attribute::AttrKind::ZExt); + return false; } /// Return true if V will require any promoted values to be truncated for the -/// use to be valid. +/// IR to remain valid. We can't mutate the value type of these +/// instructions. static bool isSink(Value *V) { + // TODO The truncate also isn't actually necessary because we would have already + // proved that the data value is kept within the range of the original data + // type. auto UsesNarrowValue = [](Value *V) { - return V->getType()->getScalarSizeInBits() <= 32; + return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize; }; if (auto *Store = dyn_cast(V)) return UsesNarrowValue(Store->getValueOperand()); if (auto *Return = dyn_cast(V)) return UsesNarrowValue(Return->getReturnValue()); + if (auto *Trunc = dyn_cast(V)) + return UsesNarrowValue(Trunc->getOperand(0)); return isa(V); } -/// Return true if the given value is a leaf that will need to be zext'd. -static bool isSource(Value *V) { - if (isa(V) && isSupportedType(V)) - return true; - else if (isa(V)) - return true; - else if (auto *ZExt = dyn_cast(V)) - // ZExt can be a leaf if its the only user of a load. - return isa(ZExt->getOperand(0)) && - ZExt->getOperand(0)->hasOneUse(); - else if (auto *Call = dyn_cast(V)) - return Call->hasRetAttr(Attribute::AttrKind::ZExt); - else if (auto *Load = dyn_cast(V)) { - if (!isa(Load->getType())) - return false; - // A load is a leaf, unless its already just being zext'd. - if (Load->hasOneUse() && isa(*Load->use_begin())) - return false; - - return true; - } - return false; -} - /// Return whether the instruction can be promoted within any modifications to /// it's operands or result. 
static bool isSafeOverflow(Instruction *I) { + // FIXME Do we need NSW too? if (isa(I) && I->hasNoUnsignedWrap()) return true; @@ -222,19 +222,18 @@ } static bool shouldPromote(Value *V) { - auto *I = dyn_cast(V); - if (!I) + if (!isa(V->getType()) || isSink(V)) return false; - if (!isa(V->getType())) - return false; + if (isSource(V)) + return true; - if (isa(I) || isa(I) || isa(I) || - isa(I)) + auto *I = dyn_cast(V); + if (!I) return false; - if (auto *ZExt = dyn_cast(I)) - return !ZExt->getDestTy()->isIntegerTy(32); + if (isa(I)) + return false; return true; } @@ -262,7 +261,7 @@ /// Return the intrinsic for the instruction that can perform the same /// operation but on a narrow type. This is using the parallel dsp intrinsics /// on scalar values. -static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) { +static Intrinsic::ID getNarrowIntrinsic(Instruction *I) { // Whether we use the signed or unsigned versions of these intrinsics // doesn't matter because we're not using the GE bits that they set in // the APSR. @@ -270,10 +269,10 @@ default: break; case Instruction::Add: - return TypeSize == 16 ? Intrinsic::arm_uadd16 : + return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 : Intrinsic::arm_uadd8; case Instruction::Sub: - return TypeSize == 16 ? Intrinsic::arm_usub16 : + return ARMCodeGenPrepare::TypeSize == 16 ? 
Intrinsic::arm_usub16 : Intrinsic::arm_usub8; } llvm_unreachable("unhandled opcode for narrow intrinsic"); @@ -285,10 +284,9 @@ SmallPtrSetImpl &Roots) { IRBuilder<> Builder{Ctx}; Type *ExtTy = Type::getInt32Ty(M->getContext()); - unsigned TypeSize = OrigTy->getPrimitiveSizeInBits(); SmallPtrSet Promoted; - LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize - << " to 32-bits\n"); + LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " + << ARMCodeGenPrepare::TypeSize << " to 32-bits\n"); auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) { SmallVector Users; @@ -325,7 +323,7 @@ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for " << *I << "\n"); Function *DSPInst = - Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize)); + Intrinsic::getDeclaration(M, getNarrowIntrinsic(I)); Builder.SetInsertPoint(I); Builder.SetCurrentDebugLocation(I->getDebugLoc()); Value *Args[] = { I->getOperand(0), I->getOperand(1) }; @@ -353,9 +351,7 @@ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n"); for (auto V : Leaves) { LLVM_DEBUG(dbgs() << " - " << *V << "\n"); - if (auto *ZExt = dyn_cast(V)) - ZExt->mutateType(ExtTy); - else if (auto *I = dyn_cast(V)) + if (auto *I = dyn_cast(V)) InsertZExt(I, I); else if (auto *Arg = dyn_cast(V)) { BasicBlock &BB = Arg->getParent()->front(); @@ -401,17 +397,9 @@ for (auto *V : Visited) { if (Leaves.count(V)) continue; - if (auto *ZExt = dyn_cast(V)) { - if (ZExt->getDestTy() != ExtTy) { - ZExt->mutateType(ExtTy); - Promoted.insert(ZExt); - } - else if (ZExt->getSrcTy() == ExtTy) { - ReplaceAllUsersOfWith(V, ZExt->getOperand(0)); - InstsToRemove.push_back(ZExt); - } + + if (!isa(V)) continue; - } if (!shouldPromote(V) || isPromotedResultSafe(V)) continue; @@ -459,30 +447,6 @@ LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n"); } -bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) { - if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) - return false; - - if 
(ST->isThumb() && !ST->hasThumb2()) - return false; - - if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) - return false; - - // TODO - // Would it be profitable? For Thumb code, these parallel DSP instructions - // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For - // Cortex-A, specifically Cortex-A72, the latency is double and throughput is - // halved. They also do not take immediates as operands. - for (auto &Op : I->operands()) { - if (isa(Op)) { - if (!EnableDSPWithImms) - return false; - } - } - return true; -} - /// We accept most instructions, as well as Arguments and ConstantInsts. We /// Disallow casts other than zext and truncs and only allow calls if their /// return value is zeroext. We don't allow opcodes that can introduce sign @@ -490,42 +454,42 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) { LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n"); - // Non-instruction values that we can handle. - if (isa(V) || isa(V)) - return true; + if (auto *ICmp = dyn_cast(V)) + return ICmp->isEquality() || !ICmp->isSigned(); // Memory instructions - if (isa(V) || isa(V) || isa(V)) + if (isa(V) || isa(V)) return true; // Branches and targets. - if (auto *ICmp = dyn_cast(V)) - return ICmp->isEquality() || !ICmp->isSigned(); - if( isa(V) || isa(V) || isa(V)) return true; - if (isa(V) || isa(V) || isa(V)) - return true; + // Non-instruction values that we can handle. + if (isa(V) || isa(V)) + return isSupportedType(V); + + if (isa(V) || isa(V) || isa(V) || + isa(V)) + return isSupportedType(V); + + // Currently, Trunc is the only cast we support. + if (auto *Trunc = dyn_cast(V)) + return isSupportedType(Trunc->getOperand(0)); // Special cases for calls as we need to check for zeroext // TODO We should accept calls even if they don't have zeroext, as they can // still be roots. 
if (auto *Call = dyn_cast(V)) - return Call->hasRetAttr(Attribute::AttrKind::ZExt); - else if (auto *Cast = dyn_cast(V)) { - if (isa(Cast)) - return Cast->getDestTy()->getScalarSizeInBits() <= 32; - else if (auto *Trunc = dyn_cast(V)) - return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize; - else { - LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n"); - return false; - } - } else if (!isa(V)) { + return isSupportedType(Call) && + Call->hasRetAttr(Attribute::AttrKind::ZExt); + + if (!isa(V)) { LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n"); return false; } + if (!isSupportedType(V)) + return false; bool res = !isSigned(V); if (!res) @@ -537,39 +501,49 @@ /// smaller than the targeted promoted type. Check that we're not trying to /// promote something larger than our base 'TypeSize' type. bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { - if (!isSupportedType(V)) - return false; + if (isPromotedResultSafe(V)) + return true; - unsigned VSize = 0; - if (auto *Ld = dyn_cast(V)) { - auto *PtrTy = cast(Ld->getPointerOperandType()); - VSize = PtrTy->getElementType()->getPrimitiveSizeInBits(); - } else if (auto *ZExt = dyn_cast(V)) { - VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits(); - } else { - VSize = V->getType()->getPrimitiveSizeInBits(); - } + auto *I = dyn_cast(V); + if (!I) + return false; - if (VSize > TypeSize) + // If promotion is not safe, can we use a DSP instruction to natively + // handle the narrow type? + if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) return false; - if (isPromotedResultSafe(V)) - return true; + if (ST->isThumb() && !ST->hasThumb2()) + return false; - if (auto *I = dyn_cast(V)) - return isNarrowInstSupported(I); + if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) + return false; - return false; + // TODO + // Would it be profitable? For Thumb code, these parallel DSP instructions + // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. 
For + // Cortex-A, specifically Cortex-A72, the latency is double and throughput is + // halved. They also do not take immediates as operands. + for (auto &Op : I->operands()) { + if (isa(Op)) { + if (!EnableDSPWithImms) + return false; + } + } + return true; } bool ARMCodeGenPrepare::TryToPromote(Value *V) { OrigTy = V->getType(); TypeSize = OrigTy->getPrimitiveSizeInBits(); + if (TypeSize > 16) + return false; if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V)) return false; - LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n"); + LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = " + << TypeSize << "\n"); SetVector WorkList; SmallPtrSet Leaves; @@ -584,6 +558,10 @@ if (CurrentVisited.count(V)) return true; + // Ignore pointer value that aren't instructions. + if (!isa(V) && isa(V->getType())) + return true; + if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) { LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n"); return false; @@ -638,41 +616,10 @@ } } - unsigned NumToPromote = 0; - unsigned Cost = 0; - for (auto *V : CurrentVisited) { - // Truncs will cause a uxt and no zeroext arguments will often require - // a uxt somewhere. - if (isa(V)) - ++Cost; - else if (auto *Arg = dyn_cast(V)) { - if (!Arg->hasZExtAttr()) - ++Cost; - } - - // Mem ops can automatically be extended/truncated and non-instructions - // don't need anything done. - if (Leaves.count(V) || isa(V) || !isa(V)) - continue; - - // Will need to truncate calls args and returns. 
- if (Roots.count(cast(V))) { - ++Cost; - continue; - } - - if (shouldPromote(V)) - ++NumToPromote; - } - LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n"; for (auto *I : CurrentVisited) I->dump(); ); - LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote - << " instructions = " << Cost << "\n"); - if (Cost > NumToPromote || (NumToPromote == 0)) - return false; Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots); return true; @@ -712,12 +659,8 @@ LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n"); for (auto &Op : CI.operands()) { - if (auto *I = dyn_cast(Op)) { - if (isa(I)) - MadeChange |= TryToPromote(I->getOperand(0)); - else - MadeChange |= TryToPromote(I); - } + if (auto *I = dyn_cast(Op)) + MadeChange |= TryToPromote(I); } } } @@ -744,6 +687,7 @@ false, false) char ARMCodeGenPrepare::ID = 0; +unsigned ARMCodeGenPrepare::TypeSize = 0; FunctionPass *llvm::createARMCodeGenPreparePass() { return new ARMCodeGenPrepare(); Index: test/CodeGen/ARM/arm-cgp-icmps.ll =================================================================== --- test/CodeGen/ARM/arm-cgp-icmps.ll +++ test/CodeGen/ARM/arm-cgp-icmps.ll @@ -158,39 +158,6 @@ ret i32 %res } -; CHECK-COMMON-LABEL: dsp_imm2 -; CHECK-COMMON: add r0, r1 -; CHECK-DSP-NEXT: ldrh r1, [r3] -; CHECK-DSP-NEXT: ldrh r2, [r2] -; CHECK-DSP-NEXT: subs r1, r1, r0 -; CHECK-DSP-NEXT: add r0, r2 -; CHECK-DSP-NEXT: uxth r3, r1 -; CHECK-DSP-NEXT: uxth r2, r0 -; CHECK-DSP-NEXT: cmp r2, r3 - -; CHECK-DSP-IMM: movs r1, #0 -; CHECK-DSP-IMM-NEXT: uxth r0, r0 -; CHECK-DSP-IMM-NEXT: usub16 r1, r1, r0 -; CHECK-DSP-IMM-NEXT: ldrh r0, [r2] -; CHECK-DSP-IMM-NEXT: ldrh r3, [r3] -; CHECK-DSP-IMM-NEXT: usub16 r0, r0, r1 -; CHECK-DSP-IMM-NEXT: uadd16 r1, r3, r1 -; CHECK-DSP-IMM-NEXT: cmp r0, r1 - -define i16 @dsp_imm2(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) { -entry: - %add0 = add i32 %arg0, %arg1 - %conv0 = trunc i32 %add0 to i16 - %sub0 = sub i16 0, %conv0 - %load0 = load i16, i16* %gep0, align 2 - %load1 = load 
i16, i16* %gep1, align 2 - %sub1 = sub i16 %load0, %sub0 - %add1 = add i16 %load1, %sub0 - %cmp = icmp ult i16 %sub1, %add1 - %res = select i1 %cmp, i16 %add1, i16 %sub1 - ret i16 %res -} - ; CHECK-COMMON-LABEL: dsp_var: ; CHECK-COMMON: eors r1, r0 ; CHECK-COMMON: and r2, r0, #7 @@ -267,109 +234,6 @@ ret i32 %res } -; CHECK-COMMON-LABEL: icmp_i32_zext: -; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r0] -; CHECK-COMMON: subs [[SUB:r[^ ]+]], [[LD]], #1 -; CHECK-COMMON-NOT: uxt -; CHECK-COMMON: cmp [[LD]], [[SUB]] -; CHECK-COMMON-NOT: uxt -define i8 @icmp_i32_zext(i8* %ptr) { -entry: - %gep = getelementptr inbounds i8, i8* %ptr, i32 0 - %0 = load i8, i8* %gep, align 1 - %1 = sub nuw nsw i8 %0, 1 - %conv44 = zext i8 %0 to i32 - br label %preheader - -preheader: - br label %body - -body: - %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ] - %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ] - %conv51266 = zext i8 %2 to i32 - %cmp52267 = icmp eq i32 %si.0274, %conv51266 - br i1 %cmp52267, label %if.end, label %exit - -if.end: - %inc = add i32 %si.0274, 1 - %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc - %3 = load i8, i8* %gep1, align 1 - br label %body - -exit: - ret i8 %2 -} - -@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1 -@sh1 = hidden local_unnamed_addr global i16 0, align 2 -@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2 - -; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16 -; CHECK-NODSP: ldrb [[BYTE:r[^ ]+]], -; CHECK-NODSP: strh [[BYTE]], -; CHECK-NODSP: ldrsh.w -define i32 @icmp_sext_zext_store_i8_i16() { -entry: - %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1 - %conv = zext i8 %0 to i16 - store i16 %conv, i16* @sh1, align 2 - %conv1 = zext i8 %0 to i32 - %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2 - %conv2 = sext i16 %1 to i32 - %cmp = icmp eq i32 %conv1, %conv2 - %conv3 = zext i1 %cmp to i32 
- ret i32 %conv3 -} - -; CHECK-COMMON-LABEL: or_icmp_ugt: -; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r1] -; CHECK-COMMON: subs [[SUB:r[^ ]+]], #1 -; CHECK-COMMON-NOT: uxtb -; CHECK-COMMON: cmp [[SUB]], #3 -define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) { -entry: - %0 = load i8, i8* %ptr - %1 = zext i8 %0 to i32 - %mul = shl nuw nsw i32 %1, 1 - %add0 = add nuw nsw i32 %mul, 6 - %cmp0 = icmp ne i32 %arg, %add0 - %add1 = add i8 %0, -1 - %cmp1 = icmp ugt i8 %add1, 3 - %or = or i1 %cmp0, %cmp1 - ret i1 %or -} - -; CHECK-COMMON-LABEL: icmp_switch_trunc: -; CHECK-COMMON-NOT: uxt -define i16 @icmp_switch_trunc(i16 zeroext %arg) { -entry: - %conv = add nuw i16 %arg, 15 - %mul = mul nuw nsw i16 %conv, 3 - %trunc = trunc i16 %arg to i3 - switch i3 %trunc, label %default [ - i3 0, label %sw.bb - i3 1, label %sw.bb.i - ] - -sw.bb: - %cmp0 = icmp ult i16 %mul, 127 - %select = select i1 %cmp0, i16 %mul, i16 127 - br label %exit - -sw.bb.i: - %cmp1 = icmp ugt i16 %mul, 34 - %select.i = select i1 %cmp1, i16 %mul, i16 34 - br label %exit - -default: - br label %exit - -exit: - %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] - ret i16 %res -} - ; CHECK-COMMON-LABEL: icmp_eq_minus_one ; CHECK-COMMON: cmp r0, #255 define i32 @icmp_eq_minus_one(i8* %ptr) { @@ -392,77 +256,3 @@ ret i32 %res } -; CHECK-COMMON-LABEL: mul_wrap -; CHECK-COMMON: mul -; CHECK-COMMON: uxth -; CHECK-COMMON: cmp -define i16 @mul_wrap(i16 %arg0, i16 %arg1) { - %mul = mul i16 %arg0, %arg1 - %cmp = icmp eq i16 %mul, 1 - %res = select i1 %cmp, i16 %arg0, i16 47 - ret i16 %res -} - -; CHECK-COMMON-LABEL: shl_wrap -; CHECK-COMMON: lsl -; CHECK-COMMON: uxth -; CHECK-COMMON: cmp -define i16 @shl_wrap(i16 %arg0) { - %mul = shl i16 %arg0, 4 - %cmp = icmp eq i16 %mul, 1 - %res = select i1 %cmp, i16 %arg0, i16 47 - ret i16 %res -} - -; CHECK-COMMON-LABEL: add_wrap -; CHECK-COMMON: add -; CHECK-COMMON: uxth -; CHECK-COMMON: cmp -define i16 @add_wrap(i16 %arg0, i16 %arg1) { - %add = add i16 %arg0, 128 - 
%cmp = icmp eq i16 %add, %arg1 - %res = select i1 %cmp, i16 %arg0, i16 1 - ret i16 %res -} - -; CHECK-COMMON-LABEL: sub_wrap -; CHECK-COMMON: sub -; CHECK-COMMON: uxth -; CHECK-COMMON: cmp -define i16 @sub_wrap(i16 %arg0, i16 %arg1, i16 %arg2) { - %sub = sub i16 %arg0, %arg2 - %cmp = icmp eq i16 %sub, %arg1 - %res = select i1 %cmp, i16 %arg0, i16 1 - ret i16 %res -} - -; CHECK-COMMON-LABEL: urem_trunc_icmps -; CHECK-COMMON-NOT: uxt -define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) { -entry: - %ptr = load i16*, i16** %in, align 4 - %ld = load i16, i16* %ptr, align 2 - %cmp.i = icmp eq i16 %ld, 0 - br i1 %cmp.i, label %exit, label %cond.false.i - -cond.false.i: - %rem = urem i16 5, %ld - %extract.t = trunc i16 %rem to i8 - br label %body - -body: - %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ] - %cmp = icmp ugt i8 %cond.in.i.off0, 7 - %conv5 = zext i1 %cmp to i32 - store i32 %conv5, i32* %g, align 4 - %.pr = load i32, i32* %k, align 4 - %tobool13150 = icmp eq i32 %.pr, 0 - br i1 %tobool13150, label %for.inc, label %exit - -for.inc: - %add = add nuw i8 %cond.in.i.off0, 1 - br label %body - -exit: - ret void -} Index: test/CodeGen/ARM/arm-cgp-overflow.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/arm-cgp-overflow.ll @@ -0,0 +1,49 @@ +; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s + +; CHECK: overflow_add +; CHECK: add +; CHECK: uxth +; CHECK: cmp +define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) { + %add = add i16 %a, %b + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} + +; CHECK-LABEL: overflow_sub +; CHECK: sub +; CHECK: uxth +; CHECK: cmp +define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) { + %add = sub i16 %a, %b + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} + +; 
CHECK-LABEL: overflow_mul +; CHECK: mul +; CHECK: uxth +; CHECK: cmp +define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) { + %add = mul i16 %a, %b + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} + +; CHECK-LABEL: overflow_shl +; CHECK: lsl +; CHECK: uxth +; CHECK: cmp +define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) { + %add = shl i16 %a, %b + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} Index: test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll =================================================================== --- test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll +++ test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll @@ -116,48 +116,6 @@ ret void } -; CHECK-COMMON-LABEL: phi_feeding_switch -; CHECK-COMMON: ldrb -; CHECK-COMMON: uxtb -; CHECK-COMMON-NOT: uxt -define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) { -entry: - %pre = load i8, i8* %memblock, align 1 - %conv = trunc i16 %arg to i8 - br label %header - -header: - %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ] - %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ] - %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ] - switch i8 %phi.0, label %default [ - i8 43, label %for.inc.i - i8 45, label %for.inc.i.i - ] - -for.inc.i: - %xor = xor i8 %phi.1, 1 - br label %latch - -for.inc.i.i: - %and = and i8 %phi.1, 3 - br label %latch - -default: - %sub = sub i8 %phi.0, 1 - %cmp2 = icmp ugt i8 %sub, 4 - br i1 %cmp2, label %latch, label %exit - -latch: - %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ] - %count = add nuw i8 %phi.2, 1 - store i8 %count, i8* %store, align 1 - br label %header - -exit: - ret void -} - ; CHECK-COMMON-LABEL: ret_i8 ; CHECK-COMMON-NOT: uxt define i8 @ret_i8() { @@ -186,33 +144,6 @@ ret i8 %inc2 } -; Check that %exp requires uxth in all cases, and will also be required to -; promote 
%1 for the call - unless we can generate a uadd16. -; CHECK-COMMON-LABEL: zext_load_sink_call: -; CHECK-COMMON: uxt -; CHECK-DSP-IMM: uadd16 -; CHECK-COMMON: cmp -; CHECK-DSP: uxt -; CHECK-DSP-IMM-NOT: uxt -define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) { -entry: - %0 = load i16, i16* %ptr, align 4 - %1 = add i16 %exp, 3 - %cmp = icmp eq i16 %0, %exp - br i1 %cmp, label %exit, label %if.then - -if.then: - %conv0 = zext i16 %0 to i32 - %conv1 = zext i16 %1 to i32 - %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1) - br label %exit - -exit: - %exitval = phi i32 [ %call, %if.then ], [ 0, %entry ] - ret i32 %exitval -} - - ; Check that the pass doesn't try to promote the immediate parameters. ; CHECK-COMMON-LABEL: call_with_imms ; CHECK-COMMON-NOT: uxt @@ -301,9 +232,10 @@ ret i32 undef } +; Transform will bail because of the zext ; Check that d.sroa.0.0.be is promoted passed directly into the tail call. ; CHECK-COMMON-LABEL: check_zext_phi_call_arg -; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: uxt define i32 @check_zext_phi_call_arg() { entry: br label %for.cond @@ -385,7 +317,6 @@ declare dso_local i32 @e(...) local_unnamed_addr #1 declare dso_local zeroext i16 @f(...) 
local_unnamed_addr #1 -declare i32 @dummy(i32, i32) declare i8 @dummy_i8(i8) declare i8 @dummy2(i8*, i8, i8) declare i16 @dummy3(i16) Index: test/CodeGen/ARM/arm-cgp-zext-truncs.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/arm-cgp-zext-truncs.ll @@ -0,0 +1,292 @@ +; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv7-linux-android %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP +; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM + +; Transform will fail because the trunc is not a sink. 
+; CHECK-COMMON-LABEL: dsp_trunc +; CHECK-COMMON: add [[ADD:[^ ]+]], +; CHECK-DSP-NEXT: ldrh r1, [r3] +; CHECK-DSP-NEXT: ldrh r2, [r2] +; CHECK-DSP-NEXT: subs r1, r1, [[ADD]] +; CHECK-DSP-NEXT: add r0, r2 +; CHECK-DSP-NEXT: uxth r3, r1 +; CHECK-DSP-NEXT: uxth r2, r0 +; CHECK-DSP-NEXT: cmp r2, r3 + +; With DSP-IMM, we could have: +; movs r1, #0 +; uxth r0, r0 +; usub16 r1, r1, r0 +; ldrh r0, [r2] +; ldrh r3, [r3] +; usub16 r0, r0, r1 +; uadd16 r1, r3, r1 +; cmp r0, r1 +define i16 @dsp_trunc(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) { +entry: + %add0 = add i32 %arg0, %arg1 + %conv0 = trunc i32 %add0 to i16 + %sub0 = sub i16 0, %conv0 + %load0 = load i16, i16* %gep0, align 2 + %load1 = load i16, i16* %gep1, align 2 + %sub1 = sub i16 %load0, %sub0 + %add1 = add i16 %load1, %sub0 + %cmp = icmp ult i16 %sub1, %add1 + %res = select i1 %cmp, i16 %add1, i16 %sub1 + ret i16 %res +} + +; CHECK-COMMON-LABEL: trunc_i16_i8 +; CHECK-COMMON: ldrh +; CHECK-COMMON: uxtb +; CHECK-COMMON: cmp +define i8 @trunc_i16_i8(i16* %ptr, i16 zeroext %arg0, i8 zeroext %arg1) { +entry: + %0 = load i16, i16* %ptr + %1 = add i16 %0, %arg0 + %2 = trunc i16 %1 to i8 + %3 = icmp ugt i8 %2, %arg1 + %4 = select i1 %3, i8 %2, i8 %arg1 + ret i8 %4 +} + +; The pass will bail because of the zext, otherwise we'd want something like: +; ldrb [[LD:r[^ ]+]], [r0] +; subs [[SUB:r[^ ]+]], [[LD]], #1 +; cmp [[LD]], [[SUB]] +; CHECK-COMMON-LABEL: icmp_i32_zext: +; CHECK-COMMON: uxtb +define i8 @icmp_i32_zext(i8* %ptr) { +entry: + %gep = getelementptr inbounds i8, i8* %ptr, i32 0 + %0 = load i8, i8* %gep, align 1 + %1 = sub nuw nsw i8 %0, 1 + %conv44 = zext i8 %0 to i32 + br label %preheader + +preheader: + br label %body + +body: + %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ] + %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ] + %conv51266 = zext i8 %2 to i32 + %cmp52267 = icmp eq i32 %si.0274, %conv51266 + br i1 %cmp52267, label %if.end, label %exit + +if.end: + %inc = add i32 %si.0274, 1 + 
%gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc + %3 = load i8, i8* %gep1, align 1 + br label %body + +exit: + ret i8 %2 +} + +; Won't handle zext or sext +; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16 +define i32 @icmp_sext_zext_store_i8_i16() { +entry: + %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1 + %conv = zext i8 %0 to i16 + store i16 %conv, i16* @sh1, align 2 + %conv1 = zext i8 %0 to i32 + %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2 + %conv2 = sext i16 %1 to i32 + %cmp = icmp eq i32 %conv1, %conv2 + %conv3 = zext i1 %cmp to i32 + ret i32 %conv3 +} + +; Pass will bail because of the zext, otherwise: +; ldrb [[LD:r[^ ]+]], [r1] +; subs [[SUB:r[^ ]+]], #1 +; cmp [[SUB]], #3 +; CHECK-COMMON-LABEL: or_icmp_ugt: +; CHECK-COMMON: uxt +define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) { +entry: + %0 = load i8, i8* %ptr + %1 = zext i8 %0 to i32 + %mul = shl nuw nsw i32 %1, 1 + %add0 = add nuw nsw i32 %mul, 6 + %cmp0 = icmp ne i32 %arg, %add0 + %add1 = add i8 %0, -1 + %cmp1 = icmp ugt i8 %add1, 3 + %or = or i1 %cmp0, %cmp1 + ret i1 %or +} + +; CHECK-COMMON-LABEL: icmp_switch_trunc: +; CHECK-COMMON-NOT: uxt +define i16 @icmp_switch_trunc(i16 zeroext %arg) { +entry: + %conv = add nuw i16 %arg, 15 + %mul = mul nuw nsw i16 %conv, 3 + %trunc = trunc i16 %arg to i3 + switch i3 %trunc, label %default [ + i3 0, label %sw.bb + i3 1, label %sw.bb.i + ] + +sw.bb: + %cmp0 = icmp ult i16 %mul, 127 + %select = select i1 %cmp0, i16 %mul, i16 127 + br label %exit + +sw.bb.i: + %cmp1 = icmp ugt i16 %mul, 34 + %select.i = select i1 %cmp1, i16 %mul, i16 34 + br label %exit + +default: + br label %exit + +exit: + %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] + ret i16 %res +} + +; Pass will bail because of the zext +; CHECK-COMMON-LABEL: urem_trunc_icmps +; CHECK-COMMON: uxt +define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) { +entry: + %ptr 
= load i16*, i16** %in, align 4 + %ld = load i16, i16* %ptr, align 2 + %cmp.i = icmp eq i16 %ld, 0 + br i1 %cmp.i, label %exit, label %cond.false.i + +cond.false.i: + %rem = urem i16 5, %ld + %extract.t = trunc i16 %rem to i8 + br label %body + +body: + %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ] + %cmp = icmp ugt i8 %cond.in.i.off0, 7 + %conv5 = zext i1 %cmp to i32 + store i32 %conv5, i32* %g, align 4 + %.pr = load i32, i32* %k, align 4 + %tobool13150 = icmp eq i32 %.pr, 0 + br i1 %tobool13150, label %for.inc, label %exit + +for.inc: + %add = add nuw i8 %cond.in.i.off0, 1 + br label %body + +exit: + ret void +} + +; CHECK-COMMON-LABEL: phi_feeding_switch +; CHECK-COMMON: ldrb +; CHECK-COMMON: uxtb +; CHECK-COMMON: uxtb +define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) { +entry: + %pre = load i8, i8* %memblock, align 1 + %conv = trunc i16 %arg to i8 + br label %header + +header: + %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ] + %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ] + %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ] + switch i8 %phi.0, label %default [ + i8 43, label %for.inc.i + i8 45, label %for.inc.i.i + ] + +for.inc.i: + %xor = xor i8 %phi.1, 1 + br label %latch + +for.inc.i.i: + %and = and i8 %phi.1, 3 + br label %latch + +default: + %sub = sub i8 %phi.0, 1 + %cmp2 = icmp ugt i8 %sub, 4 + br i1 %cmp2, label %latch, label %exit + +latch: + %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ] + %count = add nuw i8 %phi.2, 1 + store i8 %count, i8* %store, align 1 + br label %header + +exit: + ret void +} + +; Again, zexts will prevent the transform. +; Check that %exp requires uxth in all cases, and will also be required to +; promote %1 for the call - unless we can generate a uadd16. 
+; CHECK-COMMON-LABEL: zext_load_sink_call: +; CHECK-COMMON: uxt +; uadd16 +; cmp +; CHECK-COMMON: uxt +define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) { +entry: + %0 = load i16, i16* %ptr, align 4 + %1 = add i16 %exp, 3 + %cmp = icmp eq i16 %0, %exp + br i1 %cmp, label %exit, label %if.then + +if.then: + %conv0 = zext i16 %0 to i32 + %conv1 = zext i16 %1 to i32 + %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1) + br label %exit + +exit: + %exitval = phi i32 [ %call, %if.then ], [ 0, %entry ] + ret i32 %exitval +} + +%class.ae = type { i8 } +%class.x = type { i8 } +%class.v = type { %class.q } +%class.q = type { i16 } + +; CHECK-COMMON-LABEL: trunc_i16_i9_switch +; CHECK-COMMON-NOT: uxt +define i32 @trunc_i16_i9_switch(%class.ae* %this) { +entry: + %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this) + %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call) + %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0 + %1 = load i16, i16* %0, align 2 + %2 = trunc i16 %1 to i9 + %trunc = and i9 %2, -64 + switch i9 %trunc, label %cleanup.fold.split [ + i9 0, label %cleanup + i9 -256, label %if.then7 + ] + +if.then7: + %3 = and i16 %1, 7 + %tobool = icmp eq i16 %3, 0 + %cond = select i1 %tobool, i32 2, i32 1 + br label %cleanup + +cleanup.fold.split: + br label %cleanup + +cleanup: + %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ] + ret i32 %retval.0 +} + +declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr +declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr +declare i32 @dummy(i32, i32) + +@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1 +@sh1 = hidden local_unnamed_addr global i16 0, align 2 +@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2