Index: llvm/lib/CodeGen/TargetInstrInfo.cpp
===================================================================
--- llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -699,10 +699,13 @@
     std::swap(MI1, MI2);
 
   // 1. The previous instruction must be the same type as Inst.
-  // 2. The previous instruction must have virtual register definitions for its
+  // 2. The previous instruction must also be associative/commutative (this can
+  //    be different even for instructions with the same opcode if traits like
+  //    fast-math-flags are included).
+  // 3. The previous instruction must have virtual register definitions for its
   //    operands in the same basic block as Inst.
-  // 3. The previous instruction's result must only be used by Inst.
-  return MI1->getOpcode() == AssocOpcode &&
+  // 4. The previous instruction's result must only be used by Inst.
+  return MI1->getOpcode() == AssocOpcode && isAssociativeAndCommutative(*MI1) &&
          hasReassociableOperands(*MI1, MBB) &&
          MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
 }
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7657,7 +7657,8 @@
   case X86::VMULSSrr:
   case X86::VMULSDZrr:
   case X86::VMULSSZrr:
-    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
   default:
     return false;
   }
@@ -7843,6 +7844,20 @@
                                          MachineInstr &OldMI2,
                                          MachineInstr &NewMI1,
                                          MachineInstr &NewMI2) const {
+  // Propagate FP flags from the original instructions.
+  // But clear poison-generating flags because those may not be valid now.
+  // TODO: There should be a helper function for copying only fast-math-flags.
+  uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags();
+  NewMI1.setFlags(IntersectedFlags);
+  NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap);
+  NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap);
+  NewMI1.clearFlag(MachineInstr::MIFlag::IsExact);
+
+  NewMI2.setFlags(IntersectedFlags);
+  NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap);
+  NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap);
+  NewMI2.clearFlag(MachineInstr::MIFlag::IsExact);
+
   // Integer instructions may define an implicit EFLAGS dest register operand.
   MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS);
   MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS);
Index: llvm/test/CodeGen/X86/fmf-flags.ll
===================================================================
--- llvm/test/CodeGen/X86/fmf-flags.ll
+++ llvm/test/CodeGen/X86/fmf-flags.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -check-prefix=X64
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
 
 declare float @llvm.sqrt.f32(float %x);
Index: llvm/test/CodeGen/X86/machine-combiner.ll
===================================================================
--- llvm/test/CodeGen/X86/machine-combiner.ll
+++ llvm/test/CodeGen/X86/machine-combiner.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512vl -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512vl -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX512
 
 ; Incremental updates of the instruction depths should be enough for this test
 ; case.
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512vl -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=sse -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=avx -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=avx512vl -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX512
 
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
@@ -26,9 +26,9 @@
 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %t1, %x3
+  %t0 = fadd reassoc nsz float %x0, %x1
+  %t1 = fadd reassoc nsz float %t0, %x2
+  %t2 = fadd reassoc nsz float %t1, %x3
   ret float %t2
 }
 
@@ -46,9 +46,9 @@
 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %x2, %t0
-  %t2 = fadd float %t1, %x3
+  %t0 = fadd reassoc nsz float %x0, %x1
+  %t1 = fadd reassoc nsz float %x2, %t0
+  %t2 = fadd reassoc nsz float %t1, %x3
   ret float %t2
 }
 
@@ -66,9 +66,9 @@
 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %x3, %t1
+  %t0 = fadd reassoc nsz float %x0, %x1
+  %t1 = fadd reassoc nsz float %t0, %x2
+  %t2 = fadd reassoc nsz float %x3, %t1
   ret float %t2
 }
 
@@ -86,9 +86,9 @@
 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %x2, %t0
-  %t2 = fadd float %x3, %t1
+  %t0 = fadd reassoc nsz float %x0, %x1
+  %t1 = fadd reassoc nsz float %x2, %t0
+  %t2 = fadd reassoc nsz float %x3, %t1
   ret float %t2
 }
 
@@ -117,13 +117,13 @@
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vaddss %xmm7, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %t1, %x3
-  %t3 = fadd float %t2, %x4
-  %t4 = fadd float %t3, %x5
-  %t5 = fadd float %t4, %x6
-  %t6 = fadd float %t5, %x7
+  %t0 = fadd reassoc nsz float %x0, %x1
+  %t1 = fadd reassoc nsz float %t0, %x2
+  %t2 = fadd reassoc nsz float %t1, %x3
+  %t3 = fadd reassoc nsz float %t2, %x4
+  %t4 = fadd reassoc nsz float %t3, %x5
+  %t5 = fadd reassoc nsz float %t4, %x6
+  %t6 = fadd reassoc nsz float %t5, %x7
   ret float %t6
 }
 
@@ -146,9 +146,9 @@
 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fdiv float %x0, %x1
-  %t1 = fadd float %x2, %t0
-  %t2 = fadd float %x3, %t1
+  %t0 = fdiv reassoc nsz float %x0, %x1
+  %t1 = fadd reassoc nsz float %x2, %t0
+  %t2 = fadd reassoc nsz float %x3, %t1
   ret float %t2
 }
 
@@ -168,9 +168,9 @@
 ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fdiv float %x0, %x1
-  %t1 = fmul float %x2, %t0
-  %t2 = fmul float %x3, %t1
+  %t0 = fdiv reassoc nsz float %x0, %x1
+  %t1 = fmul reassoc nsz float %x2, %t0
+  %t2 = fmul reassoc nsz float %x3, %t1
   ret float %t2
 }
 
@@ -190,9 +190,9 @@
 ; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fdiv double %x0, %x1
-  %t1 = fadd double %x2, %t0
-  %t2 = fadd double %x3, %t1
+  %t0 = fdiv reassoc nsz double %x0, %x1
+  %t1 = fadd reassoc nsz double %x2, %t0
+  %t2 = fadd reassoc nsz double %x3, %t1
   ret double %t2
 }
 
@@ -212,9 +212,9 @@
 ; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fdiv double %x0, %x1
-  %t1 = fmul double %x2, %t0
-  %t2 = fmul double %x3, %t1
+  %t0 = fdiv reassoc nsz double %x0, %x1
+  %t1 = fmul reassoc nsz double %x2, %t0
+  %t2 = fmul reassoc nsz double %x3, %t1
   ret double %t2
 }
 
@@ -240,9 +240,9 @@
 ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
 ; AVX512-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; AVX512-NEXT: retq
-  %t0 = fmul <4 x float> %x0, %x1
-  %t1 = fadd <4 x float> %x2, %t0
-  %t2 = fadd <4 x float> %x3, %t1
+  %t0 = fmul reassoc nsz <4 x float> %x0, %x1
+  %t1 = fadd reassoc nsz <4 x float> %x2, %t0
+  %t2 = fadd reassoc nsz <4 x float> %x3, %t1
   ret <4 x float> %t2
 }
 
@@ -268,9 +268,9 @@
 ; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
 ; AVX512-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; AVX512-NEXT: retq
-  %t0 = fmul <2 x double> %x0, %x1
-  %t1 = fadd <2 x double> %x2, %t0
-  %t2 = fadd <2 x double> %x3, %t1
+  %t0 = fmul reassoc nsz <2 x double> %x0, %x1
+  %t1 = fadd reassoc nsz <2 x double> %x2, %t0
+  %t2 = fadd reassoc nsz <2 x double> %x3, %t1
   ret <2 x double> %t2
 }
 
@@ -290,9 +290,9 @@
 ; AVX-NEXT: vmulps %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fadd <4 x float> %x0, %x1
-  %t1 = fmul <4 x float> %x2, %t0
-  %t2 = fmul <4 x float> %x3, %t1
+  %t0 = fadd reassoc nsz <4 x float> %x0, %x1
+  %t1 = fmul reassoc nsz <4 x float> %x2, %t0
+  %t2 = fmul reassoc nsz <4 x float> %x3, %t1
   ret <4 x float> %t2
 }
 
@@ -312,9 +312,9 @@
 ; AVX-NEXT: vmulpd %xmm3, %xmm2, %xmm1
 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
-  %t0 = fadd <2 x double> %x0, %x1
-  %t1 = fmul <2 x double> %x2, %t0
-  %t2 = fmul <2 x double> %x3, %t1
+  %t0 = fadd reassoc nsz <2 x double> %x0, %x1
+  %t1 = fmul reassoc nsz <2 x double> %x2, %t0
+  %t2 = fmul reassoc nsz <2 x double> %x3, %t1
   ret <2 x double> %t2
 }
 
@@ -343,9 +343,9 @@
 ; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512-NEXT: vaddps %ymm0, %ymm3, %ymm0
 ; AVX512-NEXT: retq
-  %t0 = fmul <8 x float> %x0, %x1
-  %t1 = fadd <8 x float> %x2, %t0
-  %t2 = fadd <8 x float> %x3, %t1
+  %t0 = fmul reassoc nsz <8 x float> %x0, %x1
+  %t1 = fadd reassoc nsz <8 x float> %x2, %t0
+  %t2 = fadd reassoc nsz <8 x float> %x3, %t1
   ret <8 x float> %t2
 }
 
@@ -374,9 +374,9 @@
 ; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512-NEXT: vaddpd %ymm0, %ymm3, %ymm0
 ; AVX512-NEXT: retq
-  %t0 = fmul <4 x double> %x0, %x1
-  %t1 = fadd <4 x double> %x2, %t0
-  %t2 = fadd <4 x double> %x3, %t1
+  %t0 = fmul reassoc nsz <4 x double> %x0, %x1
+  %t1 = fadd reassoc nsz <4 x double> %x2, %t0
+  %t2 = fadd reassoc nsz <4 x double> %x3, %t1
   ret <4 x double> %t2
 }
 
@@ -399,9 +399,9 @@
 ; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm1
 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
-  %t0 = fadd <8 x float> %x0, %x1
-  %t1 = fmul <8 x float> %x2, %t0
-  %t2 = fmul <8 x float> %x3, %t1
+  %t0 = fadd reassoc nsz <8 x float> %x0, %x1
+  %t1 = fmul reassoc nsz <8 x float> %x2, %t0
+  %t2 = fmul reassoc nsz <8 x float> %x3, %t1
   ret <8 x float> %t2
 }
 
@@ -424,9 +424,9 @@
 ; AVX-NEXT: vmulpd %ymm3, %ymm2, %ymm1
 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
-  %t0 = fadd <4 x double> %x0, %x1
-  %t1 = fmul <4 x double> %x2, %t0
-  %t2 = fmul <4 x double> %x3, %t1
+  %t0 = fadd reassoc nsz <4 x double> %x0, %x1
+  %t1 = fmul reassoc nsz <4 x double> %x2, %t0
+  %t2 = fmul reassoc nsz <4 x double> %x3, %t1
   ret <4 x double> %t2
 }
 
@@ -464,9 +464,9 @@
 ; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; AVX512-NEXT: vaddps %zmm0, %zmm3, %zmm0
 ; AVX512-NEXT: retq
-  %t0 = fmul <16 x float> %x0, %x1
-  %t1 = fadd <16 x float> %x2, %t0
-  %t2 = fadd <16 x float> %x3, %t1
+  %t0 = fmul reassoc nsz <16 x float> %x0, %x1
+  %t1 = fadd reassoc nsz <16 x float> %x2, %t0
+  %t2 = fadd reassoc nsz <16 x float> %x3, %t1
   ret <16 x float> %t2
 }
 
@@ -504,9 +504,9 @@
 ; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; AVX512-NEXT: vaddpd %zmm0, %zmm3, %zmm0
 ; AVX512-NEXT: retq
-  %t0 = fmul <8 x double> %x0, %x1
-  %t1 = fadd <8 x double> %x2, %t0
-  %t2 = fadd <8 x double> %x3, %t1
+  %t0 = fmul reassoc nsz <8 x double> %x0, %x1
+  %t1 = fadd reassoc nsz <8 x double> %x2, %t0
+  %t2 = fadd reassoc nsz <8 x double> %x3, %t1
   ret <8 x double> %t2
 }
 
@@ -545,9 +545,9 @@
 ; AVX512-NEXT: vmulps %zmm3, %zmm2, %zmm1
 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
-  %t0 = fadd <16 x float> %x0, %x1
-  %t1 = fmul <16 x float> %x2, %t0
-  %t2 = fmul <16 x float> %x3, %t1
+  %t0 = fadd reassoc nsz <16 x float> %x0, %x1
+  %t1 = fmul reassoc nsz <16 x float> %x2, %t0
+  %t2 = fmul reassoc nsz <16 x float> %x3, %t1
   ret <16 x float> %t2
 }
 
@@ -586,9 +586,9 @@
 ; AVX512-NEXT: vmulpd %zmm3, %zmm2, %zmm1
 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
-  %t0 = fadd <8 x double> %x0, %x1
-  %t1 = fmul <8 x double> %x2, %t0
-  %t2 = fmul <8 x double> %x3, %t1
+  %t0 = fadd reassoc nsz <8 x double> %x0, %x1
+  %t1 = fmul reassoc nsz <8 x double> %x2, %t0
+  %t2 = fmul reassoc nsz <8 x double> %x3, %t1
   ret <8 x double> %t2
 }
 
@@ -1114,9 +1114,9 @@
   %x1 = call double @bar()
   %x2 = call double @bar()
   %x3 = call double @bar()
-  %t0 = fadd double %x0, %x1
-  %t1 = fadd double %t0, %x2
-  %t2 = fadd double %t1, %x3
+  %t0 = fadd reassoc nsz double %x0, %x1
+  %t1 = fadd reassoc nsz double %t0, %x2
+  %t2 = fadd reassoc nsz double %t1, %x3
   ret double %t2
 }
 
@@ -1165,9 +1165,9 @@
   %x1 = call double @bar()
   %x2 = call double @bar()
   %x3 = call double @bar()
-  %t0 = fadd double %x0, %x1
-  %t1 = fadd double %x2, %x3
-  %t2 = fadd double %t0, %t1
+  %t0 = fadd reassoc nsz double %x0, %x1
+  %t1 = fadd reassoc nsz double %x2, %x3
+  %t2 = fadd reassoc nsz double %t0, %t1
   ret double %t2
 }
 
Index: llvm/test/CodeGen/X86/pow.ll
===================================================================
--- llvm/test/CodeGen/X86/pow.ll
+++ llvm/test/CodeGen/X86/pow.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s
 
 declare float @llvm.pow.f32(float, float)
 declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
Index: llvm/test/CodeGen/X86/sqrt-fastmath.ll
===================================================================
--- llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 
 declare double @__sqrt_finite(double)
 declare float @__sqrtf_finite(float)
@@ -135,8 +135,8 @@
 ; SSE-NEXT: mulss %xmm2, %xmm3
 ; SSE-NEXT: mulss %xmm1, %xmm2
 ; SSE-NEXT: addss {{.*}}(%rip), %xmm2
-; SSE-NEXT: mulss %xmm3, %xmm2
 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: mulss %xmm3, %xmm2
 ; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0
 ; SSE-NEXT: andnps %xmm2, %xmm0
 ; SSE-NEXT: retq
@@ -148,8 +148,8 @@
 ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
@@ -182,8 +182,8 @@
 ; SSE-NEXT: mulps %xmm1, %xmm3
 ; SSE-NEXT: mulps %xmm2, %xmm1
 ; SSE-NEXT: addps {{.*}}(%rip), %xmm1
-; SSE-NEXT: mulps %xmm3, %xmm1
 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: mulps %xmm3, %xmm1
 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
 ; SSE-NEXT: cmpleps %xmm0, %xmm2
 ; SSE-NEXT: andps %xmm2, %xmm1
@@ -197,8 +197,8 @@
 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3
 ; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
 ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -211,8 +211,8 @@
 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
 ; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
@@ -246,20 +246,18 @@
 ; SSE-LABEL: f32_estimate:
 ; SSE: # %bb.0:
 ; SSE-NEXT: rsqrtss %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: mulss %xmm1, %xmm2
-; SSE-NEXT: mulss %xmm0, %xmm2
-; SSE-NEXT: addss {{.*}}(%rip), %xmm2
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: addss {{.*}}(%rip), %xmm0
 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
-; SSE-NEXT: mulss %xmm2, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: f32_estimate:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vmulss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vmulss %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0
@@ -308,20 +306,18 @@
 ; SSE-LABEL: v4f32_estimate:
 ; SSE: # %bb.0:
 ; SSE-NEXT: rsqrtps %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm0, %xmm2
-; SSE-NEXT: addps {{.*}}(%rip), %xmm2
+; SSE-NEXT: mulps %xmm1, %xmm0
+; SSE-NEXT: mulps %xmm1, %xmm0
+; SSE-NEXT: addps {{.*}}(%rip), %xmm0
 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: mulps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v4f32_estimate:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
-; AVX1-NEXT: vmulps %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vmulps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vmulps %xmm0, %xmm1, %xmm0
@@ -334,7 +330,7 @@
 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
@@ -374,31 +370,27 @@
 define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_estimate:
 ; SSE: # %bb.0:
-; SSE-NEXT: rsqrtps %xmm0, %xmm3
-; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; SSE-NEXT: movaps %xmm3, %xmm2
-; SSE-NEXT: mulps %xmm3, %xmm2
-; SSE-NEXT: mulps %xmm0, %xmm2
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; SSE-NEXT: addps %xmm0, %xmm2
-; SSE-NEXT: mulps %xmm4, %xmm2
+; SSE-NEXT: rsqrtps %xmm0, %xmm2
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; SSE-NEXT: mulps %xmm2, %xmm0
+; SSE-NEXT: mulps %xmm2, %xmm0
 ; SSE-NEXT: mulps %xmm3, %xmm2
-; SSE-NEXT: rsqrtps %xmm1, %xmm5
-; SSE-NEXT: movaps %xmm5, %xmm3
-; SSE-NEXT: mulps %xmm5, %xmm3
-; SSE-NEXT: mulps %xmm1, %xmm3
-; SSE-NEXT: addps %xmm0, %xmm3
-; SSE-NEXT: mulps %xmm4, %xmm3
-; SSE-NEXT: mulps %xmm5, %xmm3
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; SSE-NEXT: addps %xmm4, %xmm0
+; SSE-NEXT: mulps %xmm2, %xmm0
+; SSE-NEXT: rsqrtps %xmm1, %xmm2
+; SSE-NEXT: mulps %xmm2, %xmm3
+; SSE-NEXT: mulps %xmm2, %xmm1
+; SSE-NEXT: mulps %xmm2, %xmm1
+; SSE-NEXT: addps %xmm4, %xmm1
+; SSE-NEXT: mulps %xmm3, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v8f32_estimate:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtps %ymm0, %ymm1
-; AVX1-NEXT: vmulps %ymm1, %ymm1, %ymm2
-; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
 ; AVX1-NEXT: vmulps %ymm0, %ymm1, %ymm0
@@ -411,7 +403,7 @@
 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1
 ; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT: retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
@@ -459,58 +451,51 @@
 define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
 ; SSE-LABEL: v16f32_estimate:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: rsqrtps %xmm0, %xmm5
-; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; SSE-NEXT: movaps %xmm5, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; SSE-NEXT: mulps %xmm5, %xmm0
-; SSE-NEXT: mulps %xmm1, %xmm0
-; SSE-NEXT: movaps {{.*#+}} xmm7 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; SSE-NEXT: addps %xmm7, %xmm0
-; SSE-NEXT: mulps %xmm6, %xmm0
 ; SSE-NEXT: mulps %xmm5, %xmm0
-; SSE-NEXT: rsqrtps %xmm4, %xmm5
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: mulps %xmm5, %xmm1
-; SSE-NEXT: mulps %xmm4, %xmm1
-; SSE-NEXT: addps %xmm7, %xmm1
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: mulps %xmm4, %xmm6
+; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; SSE-NEXT: addps %xmm5, %xmm0
+; SSE-NEXT: mulps %xmm6, %xmm0
+; SSE-NEXT: rsqrtps %xmm1, %xmm6
 ; SSE-NEXT: mulps %xmm6, %xmm1
-; SSE-NEXT: mulps %xmm5, %xmm1
-; SSE-NEXT: rsqrtps %xmm2, %xmm5
-; SSE-NEXT: movaps %xmm5, %xmm4
-; SSE-NEXT: mulps %xmm5, %xmm4
-; SSE-NEXT: mulps %xmm2, %xmm4
-; SSE-NEXT: addps %xmm7, %xmm4
+; SSE-NEXT: mulps %xmm6, %xmm1
+; SSE-NEXT: mulps %xmm4, %xmm6
+; SSE-NEXT: addps %xmm5, %xmm1
+; SSE-NEXT: mulps %xmm6, %xmm1
+; SSE-NEXT: rsqrtps %xmm2, %xmm6
+; SSE-NEXT: mulps %xmm6, %xmm2
+; SSE-NEXT: mulps %xmm6, %xmm2
+; SSE-NEXT: mulps %xmm4, %xmm6
+; SSE-NEXT: addps %xmm5, %xmm2
+; SSE-NEXT: mulps %xmm6, %xmm2
+; SSE-NEXT: rsqrtps %xmm3, %xmm6
 ; SSE-NEXT: mulps %xmm6, %xmm4
-; SSE-NEXT: mulps %xmm5, %xmm4
-; SSE-NEXT: rsqrtps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm5
-; SSE-NEXT: mulps %xmm2, %xmm5
-; SSE-NEXT: mulps %xmm3, %xmm5
-; SSE-NEXT: addps %xmm7, %xmm5
-; SSE-NEXT: mulps %xmm6, %xmm5
-; SSE-NEXT: mulps %xmm2, %xmm5
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: movaps %xmm5, %xmm3
+; SSE-NEXT: mulps %xmm6, %xmm3
+; SSE-NEXT: mulps %xmm6, %xmm3
+; SSE-NEXT: addps %xmm5, %xmm3
+; SSE-NEXT: mulps %xmm4, %xmm3
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: v16f32_estimate:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtps %ymm0, %ymm2
 ; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX1-NEXT: vmulps %ymm2, %ymm2, %ymm4
-; AVX1-NEXT: vmulps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vmulps %ymm0, %ymm3, %ymm0
-; AVX1-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vrsqrtps %ymm1, %ymm2
-; AVX1-NEXT: vmulps %ymm2, %ymm2, %ymm5
-; AVX1-NEXT: vmulps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT: vaddps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vrsqrtps %ymm1, %ymm4
+; AVX1-NEXT: vmulps %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT: vmulps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vmulps %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX512-LABEL: v16f32_estimate:
Index: llvm/test/CodeGen/X86/vec_int_to_fp.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -5758,15 +5758,15 @@
 ; SSE2-NEXT: por %xmm5, %xmm0
 ; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; SSE2-NEXT: subpd %xmm6, %xmm0
+; SSE2-NEXT: addpd %xmm3, %xmm0
 ; SSE2-NEXT: pand %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm4, %xmm2
 ; SSE2-NEXT: psrlq $32, %xmm1
 ; SSE2-NEXT: por %xmm5, %xmm1
 ; SSE2-NEXT: subpd %xmm6, %xmm1
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1]
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: addpd %xmm3, %xmm0
-; SSE2-NEXT: addpd %xmm4, %xmm1
+; SSE2-NEXT: addpd %xmm2, %xmm1
+; SSE2-NEXT: movapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; SSE2-NEXT: addpd %xmm2, %xmm0
 ; SSE2-NEXT: addpd %xmm2, %xmm1
 ; SSE2-NEXT: movupd %xmm0, (%rdi)
 ; SSE2-NEXT: movupd %xmm1, 16(%rdi)
@@ -5786,15 +5786,15 @@
 ; SSE41-NEXT: por %xmm5, %xmm0
 ; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; SSE41-NEXT: subpd %xmm6, %xmm0
+; SSE41-NEXT: addpd %xmm3, %xmm0
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; SSE41-NEXT: por %xmm4, %xmm2
 ; SSE41-NEXT: psrlq $32, %xmm1
 ; SSE41-NEXT: por %xmm5, %xmm1
 ; SSE41-NEXT: subpd %xmm6, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1]
-; SSE41-NEXT: addpd %xmm4, %xmm0
-; SSE41-NEXT: addpd %xmm3, %xmm0
-; SSE41-NEXT: addpd %xmm4, %xmm1
+; SSE41-NEXT: addpd %xmm2, %xmm1
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; SSE41-NEXT: addpd %xmm2, %xmm0
 ; SSE41-NEXT: addpd %xmm2, %xmm1
 ; SSE41-NEXT: movupd %xmm0, (%rdi)
 ; SSE41-NEXT: movupd %xmm1, 16(%rdi)
@@ -5812,16 +5812,16 @@
 ; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; AVX1-NEXT: vsubpd %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
 ; AVX1-NEXT: vpor %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT: vsubpd %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1]
-; AVX1-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vaddpd %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vaddpd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX1-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vaddpd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovupd %xmm0, (%rdi)
 ; AVX1-NEXT: vmovupd %xmm1, 16(%rdi)
 ; AVX1-NEXT: retq
@@ -5838,16 +5838,16 @@
 ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
 ; AVX2-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; AVX2-NEXT: vsubpd %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2
 ; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm1
 ; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1
 ; AVX2-NEXT: vsubpd %xmm6, %xmm1, %xmm1
-; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1]
-; AVX2-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vaddpd %xmm4, %xmm1, %xmm1
 ; AVX2-NEXT: vaddpd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX2-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovupd %xmm0, (%rdi)
 ; AVX2-NEXT: vmovupd %xmm1, 16(%rdi)
 ; AVX2-NEXT: retq
@@ -5864,16 +5864,16 @@
 ; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
 ; AVX512F-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; AVX512F-NEXT: vsubpd %xmm6, %xmm0, %xmm0
+; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2
 ; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm1
 ; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1
 ; AVX512F-NEXT: vsubpd %xmm6, %xmm1, %xmm1
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1]
-; AVX512F-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; AVX512F-NEXT: vaddpd %xmm4, %xmm1, %xmm1
 ; AVX512F-NEXT: vaddpd %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512F-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vaddpd %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT: vmovupd %xmm0, (%rdi)
 ; AVX512F-NEXT: vmovupd %xmm1, 16(%rdi)
 ; AVX512F-NEXT: retq
@@ -5890,16 +5890,16 @@
 ; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; AVX512VL-NEXT: vsubpd %xmm6, %xmm0, %xmm0
+; AVX512VL-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2
 ; AVX512VL-NEXT: vpor %xmm4, %xmm2, %xmm2
 ; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpor %xmm5, %xmm1, %xmm1
 ; AVX512VL-NEXT: vsubpd %xmm6, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1]
-; AVX512VL-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; AVX512VL-NEXT: vaddpd %xmm4, %xmm1, %xmm1
 ; AVX512VL-NEXT: vaddpd %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512VL-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vaddpd %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT: vmovupd %xmm0, (%rdi)
 ; AVX512VL-NEXT: vmovupd %xmm1, 16(%rdi)
 ; AVX512VL-NEXT: retq
Index: llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
 
 ;
 ; vXf32 (accum)
Index: llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
 
 ;
 ; vXf32 (accum)
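For reference, here is a small standalone IR example, illustrative only and not part of the patch (the function names and file name in the suggested llc invocation are made up). It mirrors what the updated machine-combiner.ll tests exercise: after this change, reassociation in the MachineCombiner is keyed off the per-instruction `reassoc` and `nsz` fast-math flags instead of the global -enable-unsafe-fp-math setting, so only the flagged function would be expected to have its add chain rebalanced.

```llvm
; Illustrative sketch only. A plausible way to try it:
;   llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx reassoc-example.ll -o -

; No fast-math flags: the three fadds form a serial dependency chain and the
; MachineCombiner should leave them alone.
define float @adds_strict(float %x0, float %x1, float %x2, float %x3) {
  %t0 = fadd float %x0, %x1
  %t1 = fadd float %t0, %x2
  %t2 = fadd float %t1, %x3
  ret float %t2
}

; With reassoc and nsz on every fadd, the MachineCombiner may reassociate the
; chain so the first two adds are independent and can execute in parallel.
define float @adds_reassoc(float %x0, float %x1, float %x2, float %x3) {
  %t0 = fadd reassoc nsz float %x0, %x1
  %t1 = fadd reassoc nsz float %t0, %x2
  %t2 = fadd reassoc nsz float %t1, %x3
  ret float %t2
}
```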