Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17960,6 +17960,12 @@
   LoadSDNode *LLD = cast<LoadSDNode>(LHS);
   LoadSDNode *RLD = cast<LoadSDNode>(RHS);
 
+  unsigned RequiredAlignment;
+  EVT LDVT = LLD->getMemoryVT();
+  if (DAG.areNonVolatileConsecutiveLoads(LLD, RLD, LDVT.getStoreSize(), 1) &&
+      TLI.hasPairedLoad(LDVT, RequiredAlignment))
+    return false;
+
   // Token chains must be identical.
   if (LHS.getOperand(0) != RHS.getOperand(0) ||
       // Do not let this transformation reduce the number of volatile loads.
Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1077,6 +1077,19 @@
   if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
     return nullptr;
 
+  // zext(cmp) ~> select cmp, 1, 0
+  // Canonicalize into a select to give a gep the opportunity to fold into
+  // the select. This will eventually turn back into zext(cmp) if the gep
+  // does not fold.
+  if (CI.hasOneUse() && isa<GetElementPtrInst>(CI.user_back()) &&
+      cast<GEPOperator>(CI.user_back())->countNonConstantIndices() == 1) {
+    if (auto *Cmp = dyn_cast<CmpInst>(CI.getOperand(0))) {
+      auto *TV = ConstantInt::get(cast<IntegerType>(CI.getType()), 1);
+      auto *FV = ConstantInt::get(cast<IntegerType>(CI.getType()), 0);
+      return SelectInst::Create(Cmp, TV, FV);
+    }
+  }
+
   // If one of the common conversion will work, do it.
   if (Instruction *Result = commonCastTransforms(CI))
     return Result;
Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -810,6 +810,8 @@
   /// Canonicalize the position of binops relative to shufflevector.
   Instruction *foldVectorBinop(BinaryOperator &Inst);
 
+  Instruction *FoldGEPIntoSelect(GetElementPtrInst &GEP, SelectInst *SI);
+
   /// Given a binary operator, cast instruction, or select which has a PHI node
   /// as operand #0, see if we can fold the instruction into the PHI (which is
   /// only possible if all operands to the PHI are constants).
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -990,6 +990,53 @@
   return false;
 }
 
+// If we are loading from a select of GEPs, we can safely speculate the load
+// (unless either GEP was going to trap anyway) as long as we can statically
+// infer that the memory locations addressed by both GEPs are in bounds.
+static bool isSafeToSpeculateLoad(Value *TValue, Value *FValue, LoadInst &LI,
+                                  const DataLayout &DL) {
+  assert(LI.isUnordered() && "Illegal speculation of ordered load");
+
+  if (isa<GetElementPtrInst>(TValue) && isa<GetElementPtrInst>(FValue)) {
+    auto &TGEP = *cast<GetElementPtrInst>(TValue);
+    auto &FGEP = *cast<GetElementPtrInst>(FValue);
+
+    // Returns the index of the single operand at which the two Users
+    // differ, provided all of their other operands are identical.
+    // Returns -1 otherwise.
+    auto HaveIdenticalOperandsExcept = [] (User &U1, User &U2) {
+      if (U1.getNumOperands() != U2.getNumOperands())
+        return -1;
+      for (unsigned Idx = 0; Idx < U1.getNumOperands(); ++Idx)
+        if (U1.getOperand(Idx) != U2.getOperand(Idx))
+          if (std::equal(U1.op_begin()+Idx+1, U1.op_end(), U2.op_begin()+Idx+1))
+            return (int)Idx;
+      return -1;
+    };
+
+    int Idx = HaveIdenticalOperandsExcept(TGEP, FGEP);
+    // Disregard the pointer operand.
+    if (Idx > 0) {
+      auto *C1 = dyn_cast<ConstantInt>(*(TGEP.op_begin()+Idx));
+      auto *C2 = dyn_cast<ConstantInt>(*(FGEP.op_begin()+Idx));
+      if (C1 && C2) {
+        SmallVector<Value *, 8> Indices(TGEP.op_begin()+1, TGEP.op_begin()+Idx);
+        uint64_t Max = std::max(C1->getLimitedValue(), C2->getLimitedValue());
+        Type *GepRetTy = GetElementPtrInst::getGEPReturnType(
+            TGEP.getPointerOperand(), Indices);
+        Type *ElemTy = GepRetTy->getPointerElementType();
+        if ((ElemTy->isArrayTy() && ElemTy->getArrayNumElements() > Max) ||
+            (ElemTy->isStructTy() && ElemTy->getStructNumElements() > Max))
+          return true;
+      }
+    }
+  }
+
+  // Fall through to the conservative speculation logic.
+  return isSafeToLoadUnconditionally(TValue, LI.getAlignment(), DL, &LI) &&
+         isSafeToLoadUnconditionally(FValue, LI.getAlignment(), DL, &LI);
+}
+
 Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   Value *Op = LI.getOperand(0);
 
@@ -1065,8 +1112,7 @@
   if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
     // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
     unsigned Align = LI.getAlignment();
-    if (isSafeToLoadUnconditionally(SI->getOperand(1), Align, DL, SI) &&
-        isSafeToLoadUnconditionally(SI->getOperand(2), Align, DL, SI)) {
+    if (isSafeToSpeculateLoad(SI->getOperand(1), SI->getOperand(2), LI, DL)) {
       LoadInst *V1 = Builder.CreateLoad(SI->getOperand(1),
                                         SI->getOperand(1)->getName()+".val");
       LoadInst *V2 = Builder.CreateLoad(SI->getOperand(2),
Index: lib/Transforms/InstCombine/InstCombineSelect.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1568,6 +1568,13 @@
     }
   }
 
+  // Let the GEP fold into the Select if possible. We can still optimize
+  // the Select later if the GEP doesn't fold.
+  if (SI.hasOneUse() && isa<GetElementPtrInst>(SI.user_back()) &&
+      cast<GEPOperator>(SI.user_back())->countNonConstantIndices() == 1 &&
+      isa<ConstantInt>(TrueVal) && isa<ConstantInt>(FalseVal))
+    return nullptr;
+
   if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
                                     SQ.getWithInstruction(&SI)))
     return replaceInstUsesWith(SI, V);
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -781,6 +781,31 @@
   return nullptr;
 }
 
+Instruction *InstCombiner::FoldGEPIntoSelect(GetElementPtrInst &GEP,
+                                             SelectInst *SI) {
+  assert(GEP.hasOneUse() && isa<LoadInst>(GEP.user_back()) &&
+         "GEP user is not a Load");
+  assert(cast<GEPOperator>(&GEP)->countNonConstantIndices() == 1 &&
+         "GEP has multiple non-constant indices");
+
+  if (!SI->hasOneUse())
+    return nullptr;
+
+  Value *TV = SI->getTrueValue();
+  Value *FV = SI->getFalseValue();
+
+  if (!isa<ConstantInt>(TV) || !isa<ConstantInt>(FV))
+    return nullptr;
+
+  Instruction *NewTV = GEP.clone();
+  Instruction *NewFV = GEP.clone();
+  NewTV->replaceUsesOfWith(SI, TV);
+  NewFV->replaceUsesOfWith(SI, FV);
+  InsertNewInstBefore(NewTV, GEP);
+  InsertNewInstBefore(NewFV, GEP);
+  return SelectInst::Create(SI->getCondition(), NewTV, NewFV);
+}
+
 static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
                                              InstCombiner::BuilderTy &Builder) {
   if (auto *Cast = dyn_cast<CastInst>(&I))
@@ -1688,6 +1713,17 @@
     PtrOp = NewGEP;
   }
 
+  // Before combining indices, try to fold the GEP into a Select as long
+  // as its only user is a load. This can allow further folding of the
+  // load into the Select.
+  if (GEP.hasOneUse() && isa<LoadInst>(GEP.user_back()) &&
+      cast<GEPOperator>(&GEP)->countNonConstantIndices() == 1) {
+    for (auto &Op : GEP.operands())
+      if (auto *SI = dyn_cast<SelectInst>(Op))
+        if (Instruction *NewSel = FoldGEPIntoSelect(GEP, SI))
+          return NewSel;
+  }
+
   // Combine Indices - If the source pointer to this getelementptr instruction
   // is a getelementptr instruction, combine the indices of the two
   // getelementptr instructions into a single instruction.
Index: test/CodeGen/AArch64/select-load.ll
===================================================================
--- test/CodeGen/AArch64/select-load.ll
+++ test/CodeGen/AArch64/select-load.ll
@@ -17,12 +17,10 @@
 ; CHECK-NEXT:    mov w19, w1
 ; CHECK-NEXT:    mov x20, x0
 ; CHECK-NEXT:    bl foo
-; CHECK-NEXT:    add x8, x20, #8 // =8
-; CHECK-NEXT:    add x9, x20, #4 // =4
+; CHECK-NEXT:    ldp w8, w9, [x20, #4]
 ; CHECK-NEXT:    cmp w0, w19
-; CHECK-NEXT:    csel x8, x8, x9, gt
-; CHECK-NEXT:    ldr w0, [x8]
 ; CHECK-NEXT:    ldp x19, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    csel w0, w9, w8, gt
 ; CHECK-NEXT:    ldr x20, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
Index: test/Transforms/InstCombine/gep-select.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/gep-select.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-arm"
+
+%struct.B = type { [4 x %struct.A], i32 }
+%struct.A = type { [2 x i32], i32, i32 }
+
+declare i32 @val()
+
+; It's safe to speculatively execute load(select c, gep1, gep2) if the geps
+; are almost identical, having only one differing index, which is statically
+; known to be in bounds.
+define i32 @test1(%struct.B* %b, i64 %idx1, i64 %idx2, i1 %cmp) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX1:%.*]], i32 0, i64 [[IDX2:%.*]], i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX1]], i32 0, i64 [[IDX2]], i32 2
+; CHECK-NEXT:    [[GEP1_VAL:%.*]] = load i32, i32* [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2_VAL:%.*]] = load i32, i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = select i1 [[CMP:%.*]], i32 [[GEP1_VAL]], i32 [[GEP2_VAL]]
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 1
+  %gep2 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 2
+  %sel = select i1 %cmp, i32* %gep1, i32* %gep2
+  %ld = load i32, i32* %sel, align 4
+  ret i32 %ld
+}
+
+; Can't trivially speculate load(select c, gep1, gep2) since the geps
+; differ in more than a single operand.
+define i32 @test2(%struct.B* %b, i64 %idx1, i64 %idx2, i1 %cmp) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX1:%.*]], i32 0, i64 [[IDX2:%.*]], i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX1]], i32 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32* [[GEP1]], i32* [[GEP2]]
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[SEL]], align 4
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 1
+  %gep2 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 1
+  %sel = select i1 %cmp, i32* %gep1, i32* %gep2
+  %ld = load i32, i32* %sel, align 4
+  ret i32 %ld
+}
+
+; Can't speculate load(select c, gep1, gep2) since we can statically
+; infer that gep1 goes out of bounds.
+define i32 @test3(%struct.B* %b, i64 %idx1, i64 %idx2, i1 %cmp) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX1:%.*]], i32 0, i64 [[IDX2:%.*]], i32 0, i64 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX1]], i32 0, i64 [[IDX2]], i32 0, i64 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32* [[GEP1]], i32* [[GEP2]]
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[SEL]], align 4
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 0, i64 2
+  %gep2 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 0, i64 1
+  %sel = select i1 %cmp, i32* %gep1, i32* %gep2
+  %ld = load i32, i32* %sel, align 4
+  ret i32 %ld
+}
+
+; zext(cmp) ~> select cmp, 1, 0
+; gep(select cmp, 1, 0) ~> select cmp, (gep 1), (gep 0)
+; load(select cmp, (gep 1), (gep 0)) ~> select cmp, load(gep 1), load(gep 0)
+define i32 @test4(%struct.B* %b, i32 %x, i32 %y, i64 %idx) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @val()
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[CALL]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[CALL]], [[Y:%.*]]
+; CHECK-NEXT:    [[IDX1:%.*]] = zext i1 [[CMP1]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX:%.*]], i32 0, i64 [[IDX1]], i32 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX]], i32 0, i64 [[IDX1]], i32 0, i64 0
+; CHECK-NEXT:    [[DOTVAL:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[DOTVAL1:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = select i1 [[CMP2]], i32 [[DOTVAL]], i32 [[DOTVAL1]]
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %call = call i32 @val()
+  %cmp1 = icmp sgt i32 %call, %x
+  %cmp2 = icmp sgt i32 %call, %y
+  %idx1 = zext i1 %cmp1 to i64
+  %idx2 = zext i1 %cmp2 to i64
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx
+  %a = getelementptr inbounds %struct.B, %struct.B* %gep1, i32 0, i32 0
+  %gep2 = getelementptr inbounds [4 x %struct.A], [4 x %struct.A]* %a, i64 0, i64 %idx1
+  %c = getelementptr inbounds %struct.A, %struct.A* %gep2, i32 0, i32 0
+  %gep3 = getelementptr inbounds [2 x i32], [2 x i32]* %c, i64 0, i64 %idx2
+  %ld = load i32, i32* %gep3, align 4
+  ret i32 %ld
+}
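
Note for reviewers (illustration only, not part of the patch): at the source
level, the pattern these folds target is roughly a conditional read of two
members of the same aggregate whose offsets are statically known to be in
bounds. The C++ sketch below is hypothetical; the struct and function names
are made up to mirror %struct.A and %struct.B from gep-select.ll, and whether
the front end and earlier passes (for example SimplifyCFG sinking) actually
present this to InstCombine as a single load of a select of two GEPs depends
on the pipeline. Code of this shape is the kind that can now become the
ldp + csel sequence checked in test/CodeGen/AArch64/select-load.ll.

  // Hypothetical source-level example (not taken from the patch).
  struct A { int Arr[2]; int X; int Y; };  // mirrors %struct.A
  struct B { A As[4]; int N; };            // mirrors %struct.B

  int pick(B *Bp, long Idx1, long Idx2, bool Cond) {
    // Both arms address the same A object and differ only in which member
    // is read, so InstCombine can speculate both loads and select between
    // the loaded values instead of selecting between the two addresses.
    return Cond ? Bp[Idx1].As[Idx2].X : Bp[Idx1].As[Idx2].Y;
  }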