Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -366,6 +366,13 @@
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
 def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
+// Scalar loads that are converted to 128-bit vector operands
+def loadf32_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v4f32 (scalar_to_vector (loadf32 node:$ptr))))>;
+
+def loadf64_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v2f64 (scalar_to_vector (loadf64 node:$ptr))))>;
+
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
@@ -451,6 +458,15 @@
 def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
 def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
 
+// These are needed to match a scalar memop that is used in vector-only
+// math instructions such as the FP logical ops: andps, andnps, orps, xorps.
+// The instructions require a 128-bit memory operand, so the scalar load
+// must be converted to a vector.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v4f32 (scalar_to_vector (f32 (memop node:$ptr)))))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v2f64 (scalar_to_vector (f64 (memop node:$ptr)))))>;
+
 // 128-bit memop pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
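
[Reviewer note, not part of the patch.] For context on the DAG shape these new
fragments are meant to match: a scalar FP op that is lowered to a packed logical
instruction (e.g. fabs, which becomes an andpd with a constant mask that clears
the sign bit) takes a scalar load as its operand, so that load can be folded into
the 128-bit memory form only if widening it to a full 16-byte access is legal.
A minimal IR sketch, written in the same old-style IR syntax as the tests below;
the function and pointer names are illustrative only:

; Illustration only: fabs on a scalar double is lowered to a packed 'andpd'
; with a sign-clearing mask, so the 8-byte load of %x may be folded into the
; andpd only when reading a full 16 bytes at %p would be legal.
define double @fabs_of_load(double* %p) nounwind {
entry:
  %x = load double* %p, align 8
  %r = tail call double @fabs( double %x ) readnone
  ret double %r
}

declare double @fabs(double)
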
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -2874,21 +2874,19 @@
 multiclass sse12_fp_packed_scalar_logical_alias<
     bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
+              FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
               PS, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
-        PD, VEX_4V;
+              FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+              PD, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
-                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
-                PS;
+              f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
 
     defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
-                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
-                PD;
+              f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
   }
 }
 
@@ -2910,12 +2908,10 @@
     bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
   let Predicates = [HasAVX, NoVLX] in {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-              VR128, v4f32, f128mem, memopv4f32, SSEPackedSingle, itins, 0>,
-              PS, VEX_4V;
+      VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-        VR128, v2f64, f128mem, memopv2f64, SSEPackedDouble, itins, 0>,
-        PD, VEX_4V;
+      VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, PD, VEX_4V;
   }
 
   let Constraints = "$src1 = $dst" in {
Index: test/CodeGen/X86/logical-load-fold.ll
===================================================================
--- test/CodeGen/X86/logical-load-fold.ll
+++ test/CodeGen/X86/logical-load-fold.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=sse2,sse-unaligned-mem | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=avx                    | FileCheck %s --check-prefix=AVX
+
+; Although we have the ability to fold an unaligned load with AVX
+; and, under special conditions, with some SSE implementations, we
+; cannot fold the load under any circumstances in these test cases
+; because they are not 16-byte loads. The load must be executed as
+; a scalar ('movs*') with a zero extension to 128 bits and then
+; used in the packed logical ('andp*') op.
+; PR22371 - http://llvm.org/bugs/show_bug.cgi?id=22371
+
+define double @load_double_no_fold(double %x, double %y) {
+; SSE2-LABEL: load_double_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmplesd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    andpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_double_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge double %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to double
+  ret double %conv
+}
+
+define float @load_float_no_fold(float %x, float %y) {
+; SSE2-LABEL: load_float_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmpless %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_float_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge float %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to float
+  ret float %conv
+}
Index: test/CodeGen/X86/stack-align-vector-load.ll
===================================================================
--- test/CodeGen/X86/stack-align-vector-load.ll
+++ test/CodeGen/X86/stack-align-vector-load.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
+
+; The double argument is at 4(esp), which is 16-byte aligned, so the load
+; meets the alignment requirement for folding into the andpd.
+
+; XFAIL: *
+; Although the load in this test can be folded because it meets the alignment requirement,
+; it becomes a 128-bit load. In most situations, that means we can't fold the load
+; because we'd be reading undefined memory. In this case, we know the extra 8 bytes on the
+; stack must be part of the { double, double } struct, so it should be safe to read
+; the extra bytes, but we need to teach the backend to recognize this case.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin8"
+@G = external global double
+
+define void @test({ double, double }* byval  %z, double* %P) nounwind {
+entry:
+	%tmp3 = load double* @G, align 16		; <double> [#uses=1]
+	%tmp4 = tail call double @fabs( double %tmp3 ) readnone	; <double> [#uses=1]
+        store volatile double %tmp4, double* %P
+	%tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
+	%tmp1 = load volatile double* %tmp, align 8		; <double> [#uses=1]
+	%tmp2 = tail call double @fabs( double %tmp1 ) readnone	; <double> [#uses=1]
+    ; CHECK: andpd{{.*}}4(%esp), %xmm
+	%tmp6 = fadd double %tmp4, %tmp2		; <double> [#uses=1]
+	store volatile double %tmp6, double* %P, align 8
+	ret void
+}
+
+declare double @fabs(double)
+
Index: test/CodeGen/X86/stack-align.ll
===================================================================
--- test/CodeGen/X86/stack-align.ll
+++ test/CodeGen/X86/stack-align.ll
@@ -1,26 +1,9 @@
 ; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
 
-; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
-; fold the load into the andpd.
-
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
 @G = external global double
 
-define void @test({ double, double }* byval  %z, double* %P) nounwind {
-entry:
-	%tmp3 = load double* @G, align 16		; <double> [#uses=1]
-	%tmp4 = tail call double @fabs( double %tmp3 ) readnone	; <double> [#uses=1]
-        store volatile double %tmp4, double* %P
-	%tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
-	%tmp1 = load volatile double* %tmp, align 8		; <double> [#uses=1]
-	%tmp2 = tail call double @fabs( double %tmp1 ) readnone	; <double> [#uses=1]
-    ; CHECK: andpd{{.*}}4(%esp), %xmm
-	%tmp6 = fadd double %tmp4, %tmp2		; <double> [#uses=1]
-	store volatile double %tmp6, double* %P, align 8
-	ret void
-}
-
 define void @test2() alignstack(16) nounwind {
 entry:
     ; CHECK: andl{{.*}}$-16, %esp
@@ -36,8 +19,6 @@
     ret <2 x double> %A
 }
 
-declare double @fabs(double)
-
 ; The pointer is already known aligned, so and x,-16 is eliminable.
 define i32 @test4() nounwind {
 entry: