Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -366,6 +366,13 @@
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
 def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
+// Scalar loads that are converted to 128-bit vector operands
+def loadf32_128 : PatFrag<(ops node:$ptr),
+                          (bitconvert (v4f32 (scalar_to_vector (loadf32 node:$ptr))))>;
+
+def loadf64_128 : PatFrag<(ops node:$ptr),
+                          (bitconvert (v2f64 (scalar_to_vector (loadf64 node:$ptr))))>;
+
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
@@ -451,6 +458,15 @@
 def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
 def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
 
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a scalar to a vector.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v4f32 (scalar_to_vector (f32 (memop node:$ptr)))))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v2f64 (scalar_to_vector (f64 (memop node:$ptr)))))>;
+
 // 128-bit memop pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -2874,21 +2874,19 @@
 multiclass sse12_fp_packed_scalar_logical_alias<
     bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-                FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
+                FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
                 PS, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-                FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
-                PD, VEX_4V;
+                FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+                PD, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
-                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
-                PS;
+                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
 
     defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
-                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
-                PD;
+                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
   }
 }
 
@@ -2910,12 +2908,10 @@
     bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
   let Predicates = [HasAVX, NoVLX] in {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-                VR128, v4f32, f128mem, memopv4f32, SSEPackedSingle, itins, 0>,
-                PS, VEX_4V;
+                VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-                VR128, v2f64, f128mem, memopv2f64, SSEPackedDouble, itins, 0>,
-                PD, VEX_4V;
+                VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, PD, VEX_4V;
   }
 
   let Constraints = "$src1 = $dst" in {
Index: test/CodeGen/X86/logical-load-fold.ll
===================================================================
--- test/CodeGen/X86/logical-load-fold.ll
+++ test/CodeGen/X86/logical-load-fold.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=sse2,sse-unaligned-mem | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=avx | FileCheck %s --check-prefix=AVX
+
+; Although we have the ability to fold an unaligned load with AVX
+; and under special conditions with some SSE implementations, we
+; can not fold the load under any circumstances in these test
+; cases because they are not 16-byte loads. The load must be
+; executed as a scalar ('movs*') with a zero extension to
+; 128-bits and then used in the packed logical ('andp*') op.
+; PR22371 - http://llvm.org/bugs/show_bug.cgi?id=22371
+
+define double @load_double_no_fold(double %x, double %y) {
+; SSE2-LABEL: load_double_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmplesd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    andpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_double_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge double %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to double
+  ret double %conv
+}
+
+define float @load_float_no_fold(float %x, float %y) {
+; SSE2-LABEL: load_float_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmpless %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_float_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge float %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to float
+  ret float %conv
+}
Index: test/CodeGen/X86/stack-align-vector-load.ll
===================================================================
--- test/CodeGen/X86/stack-align-vector-load.ll
+++ test/CodeGen/X86/stack-align-vector-load.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
+
+; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
+; fold the load into the andpd.
+
+; XFAIL: *
+; Although the load in this test can be folded because it meets the alignment requirement,
+; it becomes a 128-bit load. In most situations, that means we can't fold the load
+; because we'd be reading undefined memory. In this case, we know the extra 8 bytes on the
+; stack must be part of the { double, double } struct, so it should be safe to read
+; the extra bytes, but we need to teach the backend to recognize this case.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin8"
+@G = external global double
+
+define void @test({ double, double }* byval %z, double* %P) nounwind {
+entry:
+  %tmp3 = load double* @G, align 16		; <double> [#uses=1]
+  %tmp4 = tail call double @fabs( double %tmp3 ) readnone	; <double> [#uses=1]
+  store volatile double %tmp4, double* %P
+  %tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
+  %tmp1 = load volatile double* %tmp, align 8		; <double> [#uses=1]
+  %tmp2 = tail call double @fabs( double %tmp1 ) readnone	; <double> [#uses=1]
+  ; CHECK: andpd{{.*}}4(%esp), %xmm
+  %tmp6 = fadd double %tmp4, %tmp2		; <double> [#uses=1]
+  store volatile double %tmp6, double* %P, align 8
+  ret void
+}
+
+declare double @fabs(double)
+
Index: test/CodeGen/X86/stack-align.ll
===================================================================
--- test/CodeGen/X86/stack-align.ll
+++ test/CodeGen/X86/stack-align.ll
@@ -1,26 +1,9 @@
 ; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
 
-; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
-; fold the load into the andpd.
-
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
 @G = external global double
 
-define void @test({ double, double }* byval %z, double* %P) nounwind {
-entry:
-  %tmp3 = load double* @G, align 16		; <double> [#uses=1]
-  %tmp4 = tail call double @fabs( double %tmp3 ) readnone	; <double> [#uses=1]
-  store volatile double %tmp4, double* %P
-  %tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
-  %tmp1 = load volatile double* %tmp, align 8		; <double> [#uses=1]
-  %tmp2 = tail call double @fabs( double %tmp1 ) readnone	; <double> [#uses=1]
-  ; CHECK: andpd{{.*}}4(%esp), %xmm
-  %tmp6 = fadd double %tmp4, %tmp2		; <double> [#uses=1]
-  store volatile double %tmp6, double* %P, align 8
-  ret void
-}
-
 define void @test2() alignstack(16) nounwind {
 entry:
 ; CHECK: andl{{.*}}$-16, %esp
@@ -36,8 +19,6 @@
 	ret <2 x double> %A
 }
 
-declare double @fabs(double)
-
 ; The pointer is already known aligned, so and x,-16 is eliminable.
 define i32 @test4() nounwind {
 entry: