Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -574,6 +574,10 @@
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(Loop *NewLoop);
 
+  /// Returns \p V cast to the vector type \p DstVTy. Also handles float <-->
+  /// pointer element casts, which are done via an intermediate integer vector.
+  Value *createBitCast(Value *V, VectorType *DstVTy, const DataLayout &DL);
+
   /// Emit a bypass check to see if the vector trip count is zero, including if
   /// it overflows.
   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
@@ -2852,6 +2856,7 @@
   if (Instr != Group->getInsertPos())
     return;
 
+  const DataLayout &DL = Instr->getModule()->getDataLayout();
   Value *Ptr = getPointerOperand(Instr);
 
   // Prepare for the vector type of the interleaved load/store.
@@ -2926,7 +2931,7 @@
         // If this member has different type, cast the result type.
         if (Member->getType() != ScalarTy) {
           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
-          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
+          StridedVec = createBitCast(StridedVec, OtherVTy, DL);
         }
 
         if (Group->isReverse())
@@ -2955,9 +2960,10 @@
       if (Group->isReverse())
         StoredVec = reverseVector(StoredVec);
 
-      // If this member has different type, cast it to an unified type.
+      // If this member has different type, cast it to a unified type.
+
       if (StoredVec->getType() != SubVT)
-        StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
+        StoredVec = createBitCast(StoredVec, SubVT, DL);
 
       StoredVecs.push_back(StoredVec);
     }
@@ -3292,6 +3298,40 @@
   return VectorTripCount;
 }
 
+/// Returns \p V cast to vector type \p DstVTy. Float <-> pointer element casts
+/// have no direct cast and are done in two steps via an integer vector.
+Value *InnerLoopVectorizer::createBitCast(Value *V, VectorType *DstVTy,
+                                          const DataLayout &DL) {
+  // Do a direct cast if a safe direct cast is possible.
+  if (CastInst::isBitOrNoopPointerCastable(V->getType(), DstVTy, DL))
+    return Builder.CreateBitOrPointerCast(V, DstVTy);
+
+  // Otherwise V must be a vector with the same number of elements as DstVTy.
+  unsigned VF = DstVTy->getNumElements();
+  auto *SrcVecTy = cast<VectorType>(V->getType());
+  assert(VF == SrcVecTy->getNumElements() && "Vector dimensions do not match");
+  Type *SrcElemTy = SrcVecTy->getElementType();
+  Type *DstElemTy = DstVTy->getElementType();
+  assert(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy) &&
+         "Vector elements must have the same size");
+
+  // That check misses vector<int> <-> vector<ptr>; retry on element types.
+  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL))
+    return Builder.CreateBitOrPointerCast(V, DstVTy);
+
+  // No direct cast: one side is a floating point vector, the other a vector of
+  // pointers. Cast in two steps via an integer vector: Ptr <-> Int <-> Float.
+  assert(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy() &&
+         "Only one type should be a pointer type");
+  assert(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy() &&
+         "Only one type should be a floating point type");
+  Type *IntTy =
+      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+  VectorType *VecIntTy = VectorType::get(IntTy, VF);
+  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+}
+
 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                          BasicBlock *Bypass) {
   Value *Count = getOrCreateTripCount(L);
Index: test/CodeGen/ARM/loopvectorize_pr33804_1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/loopvectorize_pr33804_1.ll
@@ -0,0 +1,90 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+; This checks that we don't crash when the vectorizer ends up having to
+; cast a float to a pointer type.
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+source_filename = "bugpoint-output-26dbd81.bc"
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+%struct.CvNode1D = type { float, %struct.CvNode1D* }
+
+@.str.13 = external unnamed_addr constant [1 x i8], align 1
+
+; CHECK-LABEL: @cvCalcEMD2
+; CHECK: vector.body
+; CHECK: store <8 x %struct.CvNode1D*>
+define void @cvCalcEMD2() local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:                                            ; the only loop in this function is %for.body14.i.i
+  invoke void @cvGetMat()
+          to label %invoke.cont unwind label %lpad.loopexit.split-lp
+
+invoke.cont:                                      ; preds = %entry
+  invoke void @cvGetMat()
+          to label %invoke.cont3 unwind label %lpad.loopexit.split-lp
+
+invoke.cont3:                                     ; preds = %invoke.cont
+  invoke void @_Znaj() #3
+          to label %call.i.i.i1408.noexc unwind label %lpad.loopexit.split-lp
+
+lpad.loopexit.split-lp:                           ; preds = %invoke.cont3, %entry, %invoke.cont
+  %lpad.loopexit.split-lp2387 = landingpad { i8*, i32 }
+          cleanup
+  resume { i8*, i32 } undef
+
+call.i.i.i1408.noexc:                             ; preds = %invoke.cont3
+  invoke void @_ZNSsC1EPKcRKSaIcE()
+          to label %invoke.cont188.i unwind label %lpad187.i
+
+invoke.cont188.i:                                 ; preds = %call.i.i.i1408.noexc
+  br label %invoke.cont203.i
+
+invoke.cont203.i:                                 ; preds = %invoke.cont188.i
+  invoke void @_ZN2cv5errorERKNS_9ExceptionE()
+          to label %invoke.cont206.i unwind label %lpad205.i
+
+invoke.cont206.i:                                 ; preds = %invoke.cont203.i
+  br label %for.body14.i.i
+
+lpad187.i:                                        ; preds = %call.i.i.i1408.noexc
+  %0 = landingpad { i8*, i32 }
+          cleanup
+  unreachable
+
+lpad205.i:                                        ; preds = %invoke.cont203.i
+  %1 = landingpad { i8*, i32 }
+          cleanup
+  unreachable
+
+for.body14.i.i:                                   ; preds = %for.body14.i.i, %invoke.cont206.i
+  %i.1424.i.i = phi i32 [ %inc21.i.i, %for.body14.i.i ], [ 0, %invoke.cont206.i ] ; loop counter, starts at 0
+  %arrayidx15.i.i1427 = getelementptr inbounds %struct.CvNode1D, %struct.CvNode1D* undef, i32 %i.1424.i.i
+  %val.i.i = getelementptr inbounds %struct.CvNode1D, %struct.CvNode1D* %arrayidx15.i.i1427, i32 0, i32 0 ; field 0: the float member
+  store float 0xC415AF1D80000000, float* %val.i.i, align 4 ; float store to field 0
+  %next19.i.i = getelementptr inbounds %struct.CvNode1D, %struct.CvNode1D* undef, i32 %i.1424.i.i, i32 1 ; field 1: the pointer member
+  store %struct.CvNode1D* undef, %struct.CvNode1D** %next19.i.i, align 4 ; pointer store to field 1
+  %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
+  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
+  br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
+
+for.end22.i.i:                                    ; preds = %for.body14.i.i
+  unreachable
+}
+
+declare void @cvGetMat() local_unnamed_addr #1
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZN2cv5errorERKNS_9ExceptionE() local_unnamed_addr #1
+
+declare void @_ZNSsC1EPKcRKSaIcE() unnamed_addr #1
+
+; Function Attrs: nobuiltin
+declare void @_Znaj() local_unnamed_addr #2
+
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { builtin }
+
Index: test/CodeGen/ARM/loopvectorize_pr33804_2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/loopvectorize_pr33804_2.ll
@@ -0,0 +1,90 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+; This checks that we don't crash when the vectorizer ends up having to
+; cast a pointer to a float type.
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+source_filename = "bugpoint-output-26dbd81.bc"
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+%struct.CvNode1D = type { %struct.CvNode1D*, float }
+
+@.str.13 = external unnamed_addr constant [1 x i8], align 1
+
+; CHECK-LABEL: @cvCalcEMD2
+; CHECK: vector.body
+; CHECK: store <8 x float>
+define void @cvCalcEMD2() local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:                                            ; the only loop in this function is %for.body14.i.i
+  invoke void @cvGetMat()
+          to label %invoke.cont unwind label %lpad.loopexit.split-lp
+
+invoke.cont:                                      ; preds = %entry
+  invoke void @cvGetMat()
+          to label %invoke.cont3 unwind label %lpad.loopexit.split-lp
+
+invoke.cont3:                                     ; preds = %invoke.cont
+  invoke void @_Znaj() #3
+          to label %call.i.i.i1408.noexc unwind label %lpad.loopexit.split-lp
+
+lpad.loopexit.split-lp:                           ; preds = %invoke.cont3, %entry, %invoke.cont
+  %lpad.loopexit.split-lp2387 = landingpad { i8*, i32 }
+          cleanup
+  resume { i8*, i32 } undef
+
+call.i.i.i1408.noexc:                             ; preds = %invoke.cont3
+  invoke void @_ZNSsC1EPKcRKSaIcE()
+          to label %invoke.cont188.i unwind label %lpad187.i
+
+invoke.cont188.i:                                 ; preds = %call.i.i.i1408.noexc
+  br label %invoke.cont203.i
+
+invoke.cont203.i:                                 ; preds = %invoke.cont188.i
+  invoke void @_ZN2cv5errorERKNS_9ExceptionE()
+          to label %invoke.cont206.i unwind label %lpad205.i
+
+invoke.cont206.i:                                 ; preds = %invoke.cont203.i
+  br label %for.body14.i.i
+
+lpad187.i:                                        ; preds = %call.i.i.i1408.noexc
+  %0 = landingpad { i8*, i32 }
+          cleanup
+  unreachable
+
+lpad205.i:                                        ; preds = %invoke.cont203.i
+  %1 = landingpad { i8*, i32 }
+          cleanup
+  unreachable
+
+for.body14.i.i:                                   ; preds = %for.body14.i.i, %invoke.cont206.i
+  %i.1424.i.i = phi i32 [ %inc21.i.i, %for.body14.i.i ], [ 0, %invoke.cont206.i ] ; loop counter, starts at 0
+  %next19.i.i = getelementptr inbounds %struct.CvNode1D, %struct.CvNode1D* undef, i32 %i.1424.i.i, i32 0 ; field 0: the pointer member
+  store %struct.CvNode1D* undef, %struct.CvNode1D** %next19.i.i, align 4 ; pointer store to field 0
+  %arrayidx15.i.i1427 = getelementptr inbounds %struct.CvNode1D, %struct.CvNode1D* undef, i32 %i.1424.i.i
+  %val.i.i = getelementptr inbounds %struct.CvNode1D, %struct.CvNode1D* %arrayidx15.i.i1427, i32 0, i32 1 ; field 1: the float member
+  store float 0xC415AF1D80000000, float* %val.i.i, align 4 ; float store to field 1
+  %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
+  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
+  br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
+
+for.end22.i.i:                                    ; preds = %for.body14.i.i
+  unreachable
+}
+
+declare void @cvGetMat() local_unnamed_addr #1
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZN2cv5errorERKNS_9ExceptionE() local_unnamed_addr #1
+
+declare void @_ZNSsC1EPKcRKSaIcE() unnamed_addr #1
+
+; Function Attrs: nobuiltin
+declare void @_Znaj() local_unnamed_addr #2
+
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { builtin }
+