Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -965,6 +965,7 @@
                                  bool IsPairwiseForm) const;
   int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
                              bool IsUnsigned) const;
+  Optional<int> getLoadCombineCost(Value *V) const;
 
   /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
   /// Three cases are handled: 1. scalar instruction 2. vector instruction
@@ -1293,6 +1294,7 @@
                                          bool IsPairwiseForm) = 0;
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                      bool IsPairwiseForm, bool IsUnsigned) = 0;
+  virtual Optional<int> getLoadCombineCost(Value *V) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                     ArrayRef<Type *> Tys, FastMathFlags FMF,
                                     unsigned ScalarizationCostPassed) = 0;
@@ -1690,6 +1692,9 @@
                              bool IsPairwiseForm, bool IsUnsigned) override {
     return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
   }
+  Optional<int> getLoadCombineCost(Value *V) override {
+    return Impl.getLoadCombineCost(V);
+  }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
                             FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
     return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -487,6 +487,8 @@
 
   unsigned getMinMaxReductionCost(Type *, Type *, bool, bool) { return 1; }
 
+  Optional<int> getLoadCombineCost(Value *V) { return llvm::None; }
+
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }
 
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) {
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -39,6 +39,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/MCSchedule.h"
@@ -61,6 +62,7 @@
 class ScalarEvolution;
 class SCEV;
 class TargetMachine;
+using namespace llvm::PatternMatch;
 
 extern cl::opt<unsigned> PartialUnrollingThreshold;
 
@@ -1687,6 +1689,50 @@
            ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  /// Given an input value that is an element of an 'or' reduction, check if the
+  /// reduction is composed of narrower loaded values. Assuming that a
+  /// legal-sized reduction of shifted/zexted loaded values can be load combined
+  /// in the backend, create a relative cost that accounts for the removal of
+  /// the intermediate ops and replacement by a single wide load.
+  /// TODO: If load combining is allowed in the IR optimizer, this analysis
+  /// may not be necessary.
+  llvm::Optional<int> getLoadCombineCost(Value *V) {
+    // Look past the reduction to find a source value. Arbitrarily follow the
+    // path through operand 0 of any 'or'. Also, peek through optional
+    // shift-left-by-constant.
+    Value *ZextLoad = V;
+    while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+           match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+      ZextLoad = cast<Instruction>(ZextLoad)->getOperand(0);
+
+    // Check if the input to the reduction is an extended load.
+    Value *LoadPtr;
+    if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+      return llvm::None;
+
+    // Require that the total load bit width is a legal integer type.
+    // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+    // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+    Type *WideType = V->getType();
+    Type *EltType = LoadPtr->getType()->getPointerElementType();
+    unsigned WideWidth = WideType->getIntegerBitWidth();
+    unsigned EltWidth = EltType->getIntegerBitWidth();
+    if (!isTypeLegal(WideType) || WideWidth % EltWidth != 0)
+      return llvm::None;
+
+    // Calculate relative cost: {narrow load+zext+shl+or} are assumed to be
+    // removed for each element and replaced by a single wide load.
+    auto *ConcreteTTI = static_cast<T *>(this);
+    int Cost = 0;
+    Cost -= ConcreteTTI->getMemoryOpCost(Instruction::Load, EltType, 0, 0);
+    Cost -= ConcreteTTI->getCastInstrCost(Instruction::ZExt, WideType, EltType);
+    Cost -= ConcreteTTI->getArithmeticInstrCost(Instruction::Shl, WideType);
+    Cost -= ConcreteTTI->getArithmeticInstrCost(Instruction::Or, WideType);
+    Cost *= (WideWidth / EltWidth); // Scale by number of elements.
+    Cost += ConcreteTTI->getMemoryOpCost(Instruction::Load, WideType, 0, 0);
+    return Cost;
+  }
+
   unsigned getVectorSplitCost() { return 1; }
 
   /// @}
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -726,6 +726,10 @@
   return Cost;
 }
 
+Optional<int> TargetTransformInfo::getLoadCombineCost(Value *V) const {
+  return TTIImpl->getLoadCombineCost(V);
+}
+
 unsigned
 TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
   return TTIImpl->getCostOfKeepingLiveOverCall(Tys);
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6498,10 +6498,18 @@
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
     int ScalarReduxCost = 0;
+    Optional<int> LoadCombineCost;
     switch (ReductionData.getKind()) {
     case RK_Arithmetic:
-      ScalarReduxCost =
-          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+      if (ReductionData.getOpcode() == Instruction::Or)
+        LoadCombineCost = TTI->getLoadCombineCost(FirstReducedVal);
+      if (LoadCombineCost) {
+        LLVM_DEBUG(dbgs() << "SLP: Assume load combine for scalar reduction of "
+                          << *(cast<Instruction>(ReductionRoot)) << "\n");
+      } else {
+        ScalarReduxCost =
+            TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+      }
       break;
     case RK_Min:
     case RK_Max:
@@ -6515,7 +6523,10 @@
     case RK_None:
       llvm_unreachable("Expected arithmetic or min/max reduction operation");
     }
-    ScalarReduxCost *= (ReduxWidth - 1);
+    if (LoadCombineCost)
+      ScalarReduxCost = LoadCombineCost.getValue();
+    else
+      ScalarReduxCost *= (ReduxWidth - 1);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                       << " for reduction that starts with " << *FirstReducedVal
Index: llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -15,31 +15,37 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
 ; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
 ; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
 ; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
 ; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
 ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
 ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
 ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]],
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
 ; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
 ; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
 ; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
   %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -97,18 +103,38 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]],
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
+; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
+; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
+; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
+; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
+; CHECK-NEXT:    [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
   %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -168,30 +194,36 @@
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
 ; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
 ; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
 ; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
 ; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
 ; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
 ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
 ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
 ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]],
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
 ; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
 ; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
 ; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[Z0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, i8* %arg, i64 1
   %g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -247,18 +279,38 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]],
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
+; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
+; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
+; CHECK-NEXT:    [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
+; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
+; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[S0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, i8* %arg, i64 1
   %g2 = getelementptr inbounds i8, i8* %arg, i64 2
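
For reference (illustrative only, not part of the diff): a minimal standalone sketch of the relative-cost arithmetic that getLoadCombineCost performs for an 8 x i8 --> i64 'or' reduction. The per-operation costs below are assumptions, not values taken from this patch; the BasicTTI defaults are typically 1 for each of these operations, but a real target's getMemoryOpCost/getCastInstrCost/getArithmeticInstrCost hooks may return different numbers.

  // Mirrors the cost arithmetic in getLoadCombineCost, using assumed unit costs.
  #include <cstdio>

  int main() {
    const int EltWidth = 8;   // i8 elements loaded from memory.
    const int WideWidth = 64; // Legal integer type on a 64-bit target.
    const int NumElts = WideWidth / EltWidth;

    // Assumed per-op costs; a real target's TTI hooks may differ.
    const int NarrowLoad = 1, ZExt = 1, Shl = 1, Or = 1, WideLoad = 1;

    int Cost = 0;
    Cost -= NarrowLoad + ZExt + Shl + Or; // Ops removed for each element.
    Cost *= NumElts;                      // Scale by number of elements.
    Cost += WideLoad;                     // One wide load remains.
    std::printf("relative load-combine cost: %d\n", Cost); // Prints -31.
    return 0;
  }

Because this relative cost is negative, the SLPVectorizer change above uses it as ScalarReduxCost, so the reported VecReduxCost - ScalarReduxCost grows and the 'or' reduction is left in scalar form for the backend to turn into a single wide load, as the new CHECK lines in bad-reduction.ll show.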