Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -965,6 +965,7 @@
                                  bool IsPairwiseForm) const;
   int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
                              bool IsUnsigned) const;
+  Optional<int> getLoadCombineCost(Value *V) const;
 
   /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
   /// Three cases are handled: 1. scalar instruction 2. vector instruction
@@ -1293,6 +1294,7 @@
                                          bool IsPairwiseForm) = 0;
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                      bool IsPairwiseForm, bool IsUnsigned) = 0;
+  virtual Optional<int> getLoadCombineCost(Value *V) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                     ArrayRef<Type *> Tys, FastMathFlags FMF,
                                     unsigned ScalarizationCostPassed) = 0;
@@ -1690,6 +1692,9 @@
                              bool IsPairwiseForm, bool IsUnsigned) override {
     return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
   }
+  Optional<int> getLoadCombineCost(Value *V) override {
+    return Impl.getLoadCombineCost(V);
+  }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
                             FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
     return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -487,6 +487,8 @@
 
   unsigned getMinMaxReductionCost(Type *, Type *, bool, bool) { return 1; }
 
+  Optional<int> getLoadCombineCost(Value *V) { return llvm::None; }
+
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }
 
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) {
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -39,6 +39,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/MCSchedule.h"
@@ -61,6 +62,7 @@
 class ScalarEvolution;
 class SCEV;
 class TargetMachine;
+using namespace llvm::PatternMatch;
 
 extern cl::opt<unsigned> PartialUnrollingThreshold;
 
@@ -1687,6 +1689,50 @@
            ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  /// Given an input value that is an element of an 'or' reduction, check if the
+  /// reduction is composed of narrower loaded values. Assuming that a
+  /// legal-sized reduction of shifted/zexted loaded values can be load combined
+  /// in the backend, create a relative cost that accounts for the removal of
+  /// the intermediate ops and replacement by a single wide load.
+  /// TODO: If load combining is allowed in the IR optimizer, this analysis
+  /// may not be necessary.
+  llvm::Optional<int> getLoadCombineCost(Value *V) {
+    // Look past the reduction to find a source value. Arbitrarily follow the
+    // path through operand 0 of any 'or'. Also, peek through optional
+    // shift-left-by-constant.
+    Value *ZextLoad = V;
+    while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+           match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+      ZextLoad = cast<Instruction>(ZextLoad)->getOperand(0);
+
+    // Check if the input to the reduction is an extended load.
+    Value *LoadPtr;
+    if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+      return llvm::None;
+
+    // Require that the total load bit width is a legal integer type.
+    // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+    // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+    Type *WideType = V->getType();
+    Type *EltType = LoadPtr->getType()->getPointerElementType();
+    unsigned WideWidth = WideType->getIntegerBitWidth();
+    unsigned EltWidth = EltType->getIntegerBitWidth();
+    if (!isTypeLegal(WideType) || WideWidth % EltWidth != 0)
+      return llvm::None;
+
+    // Calculate relative cost: {narrow load+zext+shl+or} are assumed to be
+    // removed for each element and replaced by a single wide load.
+    auto *ConcreteTTI = static_cast<T *>(this);
+    int Cost = 0;
+    Cost -= ConcreteTTI->getMemoryOpCost(Instruction::Load, EltType, 0, 0);
+    Cost -= ConcreteTTI->getCastInstrCost(Instruction::ZExt, WideType, EltType);
+    Cost -= ConcreteTTI->getArithmeticInstrCost(Instruction::Shl, WideType);
+    Cost -= ConcreteTTI->getArithmeticInstrCost(Instruction::Or, WideType);
+    Cost *= (WideWidth / EltWidth); // Scale by number of elements.
+    Cost += ConcreteTTI->getMemoryOpCost(Instruction::Load, WideType, 0, 0);
+    return Cost;
+  }
+
   unsigned getVectorSplitCost() { return 1; }
 
   /// @}
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -726,6 +726,10 @@
   return Cost;
 }
 
+Optional<int> TargetTransformInfo::getLoadCombineCost(Value *V) const {
+  return TTIImpl->getLoadCombineCost(V);
+}
+
 unsigned
 TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
   return TTIImpl->getCostOfKeepingLiveOverCall(Tys);
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6498,10 +6498,18 @@
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
     int ScalarReduxCost = 0;
+    Optional<int> LoadCombineCost;
     switch (ReductionData.getKind()) {
     case RK_Arithmetic:
-      ScalarReduxCost =
-          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+      if (ReductionData.getOpcode() == Instruction::Or)
+        LoadCombineCost = TTI->getLoadCombineCost(FirstReducedVal);
+      if (LoadCombineCost) {
+        LLVM_DEBUG(dbgs() << "SLP: Assume load combine for scalar reduction of "
+                          << *(cast<Instruction>(ReductionRoot)) << "\n");
+      } else {
+        ScalarReduxCost =
+            TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+      }
       break;
     case RK_Min:
     case RK_Max:
@@ -6515,7 +6523,10 @@
     case RK_None:
       llvm_unreachable("Expected arithmetic or min/max reduction operation");
     }
-    ScalarReduxCost *= (ReduxWidth - 1);
+    if (LoadCombineCost)
+      ScalarReduxCost = LoadCombineCost.getValue();
+    else
+      ScalarReduxCost *= (ReduxWidth - 1);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                       << " for reduction that starts with " << *FirstReducedVal
Index: llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -15,31 +15,37 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
 ; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
 ; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
 ; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
 ; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
 ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
 ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
 ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]],
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
 ; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
 ; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
 ; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
   %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -97,18 +103,38 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]],
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
+; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
+; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
+; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
+; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
+; CHECK-NEXT:    [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
   %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -168,30 +194,36 @@
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
 ; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
 ; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
 ; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
 ; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
 ; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
 ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
 ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
 ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]],
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
 ; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
 ; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
 ; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[Z0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, i8* %arg, i64 1
   %g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -247,18 +279,38 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]],
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
+; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
+; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
+; CHECK-NEXT:    [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
+; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
+; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[S0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, i8* %arg, i64 1
   %g2 = getelementptr inbounds i8, i8* %arg, i64 2
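
For reference (illustrative only, not part of the diff): a minimal standalone sketch of the relative-cost arithmetic that getLoadCombineCost performs for an 8 x i8 --> i64 'or' reduction. The per-operation costs below are assumptions, not values taken from this patch; the BasicTTI defaults are typically 1 for each of these operations, but a real target's getMemoryOpCost/getCastInstrCost/getArithmeticInstrCost hooks may return different numbers.

  // Mirrors the cost arithmetic in getLoadCombineCost, using assumed unit costs.
  #include <cstdio>

  int main() {
    const int EltWidth = 8;   // i8 elements loaded from memory.
    const int WideWidth = 64; // Legal integer type on a 64-bit target.
    const int NumElts = WideWidth / EltWidth;

    // Assumed per-op costs; a real target's TTI hooks may differ.
    const int NarrowLoad = 1, ZExt = 1, Shl = 1, Or = 1, WideLoad = 1;

    int Cost = 0;
    Cost -= NarrowLoad + ZExt + Shl + Or; // Ops removed for each element.
    Cost *= NumElts;                      // Scale by number of elements.
    Cost += WideLoad;                     // One wide load remains.
    std::printf("relative load-combine cost: %d\n", Cost); // Prints -31.
    return 0;
  }

Because this relative cost is negative, the SLPVectorizer change above uses it as ScalarReduxCost, so the reported VecReduxCost - ScalarReduxCost grows and the 'or' reduction is left in scalar form for the backend to turn into a single wide load, as the new CHECK lines in bad-reduction.ll show.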