Index: llvm/trunk/lib/Target/X86/X86FastISel.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86FastISel.cpp
+++ llvm/trunk/lib/Target/X86/X86FastISel.cpp
@@ -2155,8 +2155,8 @@
   // Choose the SSE instruction sequence based on data type (float or double).
   static const uint16_t OpcTable[2][4] = {
-    { X86::CMPSSrr,  X86::FsANDPSrr,  X86::FsANDNPSrr,  X86::FsORPSrr  },
-    { X86::CMPSDrr,  X86::FsANDPDrr,  X86::FsANDNPDrr,  X86::FsORPDrr  }
+    { X86::CMPSSrr,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
+    { X86::CMPSDrr,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
   };
 
   const uint16_t *Opc = nullptr;
@@ -2236,14 +2236,18 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
   } else {
+    const TargetRegisterClass *VR128 = &X86::VR128RegClass;
     unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
                                        CmpRHSReg, CmpRHSIsKill, CC);
-    unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+    unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
                                       LHSReg, LHSIsKill);
-    unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+    unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
                                        RHSReg, RHSIsKill);
-    ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
-                                AndReg, /*IsKill=*/true);
+    unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
+                                     AndReg, /*IsKill=*/true);
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
   }
   updateValueMap(I, ResultReg);
   return true;
Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -624,15 +624,6 @@
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
 def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
-// These are needed to match a scalar load that is used in a vector-only
-// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
-// The memory operand is required to be a 128-bit load, so it must be converted
-// from a vector to a scalar.
-def loadf32_128 : PatFrag<(ops node:$ptr),
-                          (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>;
-def loadf64_128 : PatFrag<(ops node:$ptr),
-                          (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>;
-
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -8365,16 +8365,12 @@
   { X86::MOVNTPSmr,   X86::MOVNTPDmr,  X86::MOVNTDQmr  },
   { X86::ANDNPSrm,    X86::ANDNPDrm,   X86::PANDNrm    },
   { X86::ANDNPSrr,    X86::ANDNPDrr,   X86::PANDNrr    },
-  { X86::FsANDNPSrr,  X86::FsANDNPDrr, X86::PANDNrr    },
   { X86::ANDPSrm,     X86::ANDPDrm,    X86::PANDrm     },
   { X86::ANDPSrr,     X86::ANDPDrr,    X86::PANDrr     },
-  { X86::FsANDPSrr,   X86::FsANDPDrr,  X86::PANDrr     },
   { X86::ORPSrm,      X86::ORPDrm,     X86::PORrm      },
   { X86::ORPSrr,      X86::ORPDrr,     X86::PORrr      },
-  { X86::FsORPSrr,    X86::FsORPDrr,   X86::PORrr      },
   { X86::XORPSrm,     X86::XORPDrm,    X86::PXORrm     },
   { X86::XORPSrr,     X86::XORPDrr,    X86::PXORrr     },
-  { X86::FsXORPSrr,   X86::FsXORPDrr,  X86::PXORrr     },
   // AVX 128-bit support
   { X86::VMOVAPSmr,   X86::VMOVAPDmr,  X86::VMOVDQAmr  },
   { X86::VMOVAPSrm,   X86::VMOVAPDrm,  X86::VMOVDQArm  },
@@ -8385,16 +8381,12 @@
   { X86::VMOVNTPSmr,  X86::VMOVNTPDmr, X86::VMOVNTDQmr },
   { X86::VANDNPSrm,   X86::VANDNPDrm,  X86::VPANDNrm   },
   { X86::VANDNPSrr,   X86::VANDNPDrr,  X86::VPANDNrr   },
-  { X86::VFsANDNPSrr, X86::VFsANDNPDrr,X86::VPANDNrr   },
   { X86::VANDPSrm,    X86::VANDPDrm,   X86::VPANDrm    },
   { X86::VANDPSrr,    X86::VANDPDrr,   X86::VPANDrr    },
-  { X86::VFsANDPSrr,  X86::VFsANDPDrr, X86::VPANDrr    },
   { X86::VORPSrm,     X86::VORPDrm,    X86::VPORrm     },
   { X86::VORPSrr,     X86::VORPDrr,    X86::VPORrr     },
-  { X86::VFsORPSrr,   X86::VFsORPDrr,  X86::VPORrr     },
   { X86::VXORPSrm,    X86::VXORPDrm,   X86::VPXORrm    },
   { X86::VXORPSrr,    X86::VXORPDrr,   X86::VPXORrr    },
-  { X86::VFsXORPSrr,  X86::VFsXORPDrr, X86::VPXORrr    },
   // AVX 256-bit support
   { X86::VMOVAPSYmr,  X86::VMOVAPDYmr, X86::VMOVDQAYmr },
   { X86::VMOVAPSYrm,  X86::VMOVAPDYrm, X86::VMOVDQAYrm },
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -2777,39 +2777,6 @@
 // SSE 1 & 2 - Logical Instructions
 //===----------------------------------------------------------------------===//
 
-// Multiclass for scalars using the X86 logical operation aliases for FP.
-multiclass sse12_fp_packed_scalar_logical_alias<
-    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
-  defm V#NAME#PS : sse12_fp_packed,
-              PS, VEX_4V;
-
-  defm V#NAME#PD : sse12_fp_packed,
-              PD, VEX_4V;
-
-  let Constraints = "$src1 = $dst" in {
-    defm PS : sse12_fp_packed, PS;
-
-    defm PD : sse12_fp_packed, PD;
-  }
-}
-
-let isCodeGenOnly = 1 in {
-  defm FsAND  : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
-                SSE_BIT_ITINS_P>;
-  defm FsOR   : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
-                SSE_BIT_ITINS_P>;
-  defm FsXOR  : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
-                SSE_BIT_ITINS_P>;
-
-  let isCommutable = 0 in
-    defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
-                  SSE_BIT_ITINS_P>;
-}
-
 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
 ///
 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
@@ -2965,6 +2932,43 @@
                (VANDNPDYrm VR256:$src1, addr:$src2)>;
 }
 
+let Predicates = [HasAVX] in {
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VANDPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VXORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VANDNPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+
+  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VANDPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VXORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VANDNPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
 let Predicates = [UseSSE1] in {
   def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
             (ANDPSrr VR128:$src1, VR128:$src2)>;
@@ -2983,6 +2987,24 @@
             (XORPSrm VR128:$src1, addr:$src2)>;
   def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
             (ANDNPSrm VR128:$src1, addr:$src2)>;
+
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ANDPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (XORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ANDNPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
 }
 
 let Predicates = [UseSSE2] in {
   def : Pat<(v2f64 (X86fand VR128:$src1, VR128:$src2)),
             (ANDPDrr VR128:$src1, VR128:$src2)>;
@@ -3003,6 +3025,24 @@
             (XORPDrm VR128:$src1, addr:$src2)>;
   def : Pat<(X86fandn VR128:$src1, (memopv2f64 addr:$src2)),
             (ANDNPDrm VR128:$src1, addr:$src2)>;
+
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ANDPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (XORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ANDNPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
 }
 
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/test/CodeGen/X86/sqrt-fastmath-mir.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ llvm/trunk/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -17,10 +17,10 @@
 ; CHECK: %10 = VFMADD213SSr %8, %9, %4
 ; CHECK: %11 = VMULSSrr %9, %6
 ; CHECK: %12 = VMULSSrr killed %11, killed %10
-; CHECK: %13 = FsFLD0SS
-; CHECK: %14 = VCMPSSrr %0, killed %13, 0
-; CHECK: %15 = VFsANDNPSrr killed %14, killed %12
-; CHECK: %xmm0 = COPY %15
+; CHECK: %14 = FsFLD0SS
+; CHECK: %15 = VCMPSSrr %0, killed %14, 0
+; CHECK: %17 = VANDNPSrr killed %16, killed %13
+; CHECK: %xmm0 = COPY %18
 ; CHECK: RET 0, %xmm0
   %call = tail call float @llvm.sqrt.f32(float %f) #1
   ret float %call
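
For reference, a minimal IR sketch (illustrative only, not part of the patch; the function and value names are made up) of the kind of scalar select that reaches the fast-isel SSE select path updated in X86FastISel.cpp above:

; With fast-isel on an SSE2-only target (no SSE4.1/AVX blend), the fcmp+select
; below is expected to lower to CMPSSrr followed by the packed ANDPSrr,
; ANDNPSrr, and ORPSrr on VR128, with a final COPY back into FR32.
define float @select_oeq(float %a, float %b, float %c, float %d) {
  %cmp = fcmp oeq float %a, %b
  %sel = select i1 %cmp, float %c, float %d
  ret float %sel
}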