Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -366,6 +366,15 @@
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
 def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
+// These are needed to match a scalar load that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a scalar to a vector.
+def loadf32_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v4f32 (scalar_to_vector (loadf32 node:$ptr))))>;
+def loadf64_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v2f64 (scalar_to_vector (loadf64 node:$ptr))))>;
+
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
@@ -451,6 +460,15 @@
 def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
 def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
 
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a scalar to a vector.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v4f32 (scalar_to_vector (f32 (memop node:$ptr)))))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+  (bitconvert (v2f64 (scalar_to_vector (f64 (memop node:$ptr)))))>;
+
 // 128-bit memop pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -933,6 +933,11 @@
     { X86::DIVSSrr_Int,     X86::DIVSSrm_Int,         0 },
     { X86::DPPDrri,         X86::DPPDrmi,             TB_ALIGN_16 },
    { X86::DPPSrri,         X86::DPPSrmi,             TB_ALIGN_16 },
+
+    // FIXME: We should not be folding Fs* scalar loads into vector
+    // instructions here because the vector instructions require vector-sized
+    // loads. Earlier passes are creating these instructions with the wrong
+    // type. They should probably be generating the Fv* variants below.
     { X86::FsANDNPDrr,      X86::FsANDNPDrm,          TB_ALIGN_16 },
     { X86::FsANDNPSrr,      X86::FsANDNPSrm,          TB_ALIGN_16 },
     { X86::FsANDPDrr,       X86::FsANDPDrm,           TB_ALIGN_16 },
@@ -941,6 +946,15 @@
     { X86::FsORPSrr,        X86::FsORPSrm,            TB_ALIGN_16 },
     { X86::FsXORPDrr,       X86::FsXORPDrm,           TB_ALIGN_16 },
     { X86::FsXORPSrr,       X86::FsXORPSrm,           TB_ALIGN_16 },
+
+    { X86::FvANDNPDrr,      X86::FvANDNPDrm,          TB_ALIGN_16 },
+    { X86::FvANDNPSrr,      X86::FvANDNPSrm,          TB_ALIGN_16 },
+    { X86::FvANDPDrr,       X86::FvANDPDrm,           TB_ALIGN_16 },
+    { X86::FvANDPSrr,       X86::FvANDPSrm,           TB_ALIGN_16 },
+    { X86::FvORPDrr,        X86::FvORPDrm,            TB_ALIGN_16 },
+    { X86::FvORPSrr,        X86::FvORPSrm,            TB_ALIGN_16 },
+    { X86::FvXORPDrr,       X86::FvXORPDrm,           TB_ALIGN_16 },
+    { X86::FvXORPSrr,       X86::FvXORPSrm,           TB_ALIGN_16 },
     { X86::HADDPDrr,        X86::HADDPDrm,            TB_ALIGN_16 },
     { X86::HADDPSrr,        X86::HADDPSrm,            TB_ALIGN_16 },
     { X86::HSUBPDrr,        X86::HSUBPDrm,            TB_ALIGN_16 },
@@ -1142,14 +1156,16 @@
     { X86::VDIVSSrr_Int,    X86::VDIVSSrm_Int,        0 },
     { X86::VDPPDrri,        X86::VDPPDrmi,            0 },
     { X86::VDPPSrri,        X86::VDPPSrmi,            0 },
-    { X86::VFsANDNPDrr,     X86::VFsANDNPDrm,         0 },
-    { X86::VFsANDNPSrr,     X86::VFsANDNPSrm,         0 },
-    { X86::VFsANDPDrr,      X86::VFsANDPDrm,          0 },
-    { X86::VFsANDPSrr,      X86::VFsANDPSrm,          0 },
-    { X86::VFsORPDrr,       X86::VFsORPDrm,           0 },
-    { X86::VFsORPSrr,       X86::VFsORPSrm,           0 },
-    { X86::VFsXORPDrr,      X86::VFsXORPDrm,          0 },
-    { X86::VFsXORPSrr,      X86::VFsXORPSrm,          0 },
+    // Do not fold VFs* loads because there are no scalar load variants for
+    // these instructions; the load size would not match.
+    { X86::VFvANDNPDrr,     X86::VFvANDNPDrm,         0 },
+    { X86::VFvANDNPSrr,     X86::VFvANDNPSrm,         0 },
+    { X86::VFvANDPDrr,      X86::VFvANDPDrm,          0 },
+    { X86::VFvANDPSrr,      X86::VFvANDPSrm,          0 },
+    { X86::VFvORPDrr,       X86::VFvORPDrm,           0 },
+    { X86::VFvORPSrr,       X86::VFvORPSrm,           0 },
+    { X86::VFvXORPDrr,      X86::VFvXORPDrm,          0 },
+    { X86::VFvXORPSrr,      X86::VFvXORPSrm,          0 },
     { X86::VHADDPDrr,       X86::VHADDPDrm,           0 },
     { X86::VHADDPSrr,       X86::VHADDPSrm,           0 },
     { X86::VHSUBPDrr,       X86::VHSUBPDrm,           0 },
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -2874,21 +2874,19 @@
 multiclass sse12_fp_packed_scalar_logical_alias<
     bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
-              PS, VEX_4V;
+              FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
+              PS, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-              FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
-              PD, VEX_4V;
+              FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+              PD, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
-                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
-                PS;
+                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
 
     defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
-                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
-                PD;
+                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
   }
 }
 
Index: test/CodeGen/X86/stack-align.ll
===================================================================
--- test/CodeGen/X86/stack-align.ll
+++ test/CodeGen/X86/stack-align.ll
@@ -1,7 +1,10 @@
 ; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
 
-; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
-; fold the load into the andpd.
+; The double argument is at 4(esp) which is 16-byte aligned, but we
+; are required to read in extra bytes of memory in order to fold the
+; load. Bad Things may happen when reading/processing undefined bytes,
+; so don't fold the load.
+; PR22371 / http://reviews.llvm.org/D7474
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
@@ -15,22 +18,31 @@
   %tmp = getelementptr { double, double }* %z, i32 0, i32 0    ; [#uses=1]
   %tmp1 = load volatile double* %tmp, align 8                  ; [#uses=1]
   %tmp2 = tail call double @fabs( double %tmp1 ) readnone      ; [#uses=1]
-  ; CHECK: andpd{{.*}}4(%esp), %xmm
   %tmp6 = fadd double %tmp4, %tmp2                             ; [#uses=1]
   store volatile double %tmp6, double* %P, align 8
   ret void
+
+; CHECK-LABEL: test:
+; CHECK: movsd {{.*}}G, %xmm{{.*}}
+; CHECK: andpd %xmm{{.*}}, %xmm{{.*}}
+; CHECK: movsd 4(%esp), %xmm{{.*}}
+; CHECK: andpd %xmm{{.*}}, %xmm{{.*}}
+
+
 }
 
 define void @test2() alignstack(16) nounwind {
 entry:
-  ; CHECK: andl{{.*}}$-16, %esp
+; CHECK-LABEL: test2:
+; CHECK: andl{{.*}}$-16, %esp
   ret void
 }
 
 ; Use a call to force a spill.
 define <2 x double> @test3(<2 x double> %x, <2 x double> %y) alignstack(32) nounwind {
 entry:
-  ; CHECK: andl{{.*}}$-32, %esp
+; CHECK-LABEL: test3:
+; CHECK: andl{{.*}}$-32, %esp
   call void @test2()
   %A = fmul <2 x double> %x, %y
   ret <2 x double> %A
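
The reasoning in the updated stack-align.ll comment can be illustrated with a small C++ sketch (a hypothetical example, not code from this patch): when a call such as fabs() is lowered to ANDPD and the scalar argument load is folded into the instruction's 128-bit memory operand, the hardware reads more bytes than the scalar object occupies.

  // Hypothetical scenario: a double stored in the last 8 bytes of a
  // readable region. Folding its load into a 16-byte ANDPD memory
  // operand would read 8 bytes past the object, potentially into an
  // unmapped page.
  #include <cmath>

  double abs_of_last(const double *last_element) {
    // Only 8 bytes are guaranteed readable here; a folded 128-bit load
    // would touch 8 more.
    return std::fabs(*last_element);
  }

This is why the new CHECK lines expect a separate movsd followed by a register-register andpd rather than an andpd with a folded memory operand.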