Index: lib/Target/X86/X86MCInstLower.cpp =================================================================== --- lib/Target/X86/X86MCInstLower.cpp +++ lib/Target/X86/X86MCInstLower.cpp @@ -1826,6 +1826,205 @@ } } break; + case X86::VBROADCASTSSrm: + case X86::VBROADCASTSSYrm: + case X86::VBROADCASTSSZ128m: + case X86::VBROADCASTSSZ256m: + case X86::VBROADCASTSSZm: + case X86::VBROADCASTSDYrm: + case X86::VBROADCASTSDZ256m: + case X86::VBROADCASTSDZm: + case X86::VPBROADCASTBrm: + case X86::VPBROADCASTBYrm: + case X86::VPBROADCASTBZ128m: + case X86::VPBROADCASTBZ256m: + case X86::VPBROADCASTBZm: + case X86::VPBROADCASTDrm: + case X86::VPBROADCASTDYrm: + case X86::VPBROADCASTDZ128m: + case X86::VPBROADCASTDZ256m: + case X86::VPBROADCASTDZm: + case X86::VPBROADCASTQrm: + case X86::VPBROADCASTQYrm: + case X86::VPBROADCASTQZ128m: + case X86::VPBROADCASTQZ256m: + case X86::VPBROADCASTQZm: + case X86::VPBROADCASTWrm: + case X86::VPBROADCASTWYrm: + case X86::VPBROADCASTWZ128m: + case X86::VPBROADCASTWZ256m: + case X86::VPBROADCASTWZm: + if (!OutStreamer->isVerboseAsm()) + break; + if (MI->getNumOperands() <= 4) + break; + if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + int NumElts; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VBROADCASTSSrm: NumElts = 4; break; + case X86::VBROADCASTSSYrm: NumElts = 8; break; + case X86::VBROADCASTSSZ128m: NumElts = 4; break; + case X86::VBROADCASTSSZ256m: NumElts = 8; break; + case X86::VBROADCASTSSZm: NumElts = 16; break; + case X86::VBROADCASTSDYrm: NumElts = 4; break; + case X86::VBROADCASTSDZ256m: NumElts = 4; break; + case X86::VBROADCASTSDZm: NumElts = 8; break; + case X86::VPBROADCASTBrm: NumElts = 16; break; + case X86::VPBROADCASTBYrm: NumElts = 32; break; + case X86::VPBROADCASTBZ128m: NumElts = 16; break; + case X86::VPBROADCASTBZ256m: NumElts = 32; break; + case X86::VPBROADCASTBZm: NumElts = 64; break; + case X86::VPBROADCASTDrm: NumElts = 4; break; + case 
X86::VPBROADCASTDYrm: NumElts = 8; break; + case X86::VPBROADCASTDZ128m: NumElts = 4; break; + case X86::VPBROADCASTDZ256m: NumElts = 8; break; + case X86::VPBROADCASTDZm: NumElts = 16; break; + case X86::VPBROADCASTQrm: NumElts = 2; break; + case X86::VPBROADCASTQYrm: NumElts = 4; break; + case X86::VPBROADCASTQZ128m: NumElts = 2; break; + case X86::VPBROADCASTQZ256m: NumElts = 4; break; + case X86::VPBROADCASTQZm: NumElts = 8; break; + case X86::VPBROADCASTWrm: NumElts = 8; break; + case X86::VPBROADCASTWYrm: NumElts = 16; break; + case X86::VPBROADCASTWZ128m: NumElts = 8; break; + case X86::VPBROADCASTWZ256m: NumElts = 16; break; + case X86::VPBROADCASTWZm: NumElts = 32; break; + } + + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + CS << "["; + for (int i = 0; i != NumElts; ++i) { + if (i != 0) + CS << ","; + if (isa<UndefValue>(C)) { + CS << "u"; + } else if (auto *CI = dyn_cast<ConstantInt>(C)) { + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + const auto &Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } + } else if (auto *CF = dyn_cast<ConstantFP>(C)) { + SmallString<32> Str; + CF->getValueAPF().toString(Str); + CS << Str; + } else { + CS << "?"; + } + } + CS << "]"; + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + } + break; + case X86::VBROADCASTF128: + case X86::VBROADCASTI128: + case X86::VBROADCASTF32X4Z256rm: + case X86::VBROADCASTF32X4rm: + case X86::VBROADCASTF32X8rm: + case X86::VBROADCASTF64X2Z128rm: + case X86::VBROADCASTF64X2rm: + case X86::VBROADCASTF64X4rm: + case X86::VBROADCASTI32X4Z256rm: + case X86::VBROADCASTI32X4rm: + case X86::VBROADCASTI32X8rm: + case X86::VBROADCASTI64X2Z128rm: + case X86::VBROADCASTI64X2rm: + case X86::VBROADCASTI64X4rm: + if 
(!OutStreamer->isVerboseAsm()) + break; + if (MI->getNumOperands() <= 4) + break; + if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + int NumLanes; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VBROADCASTF128: NumLanes = 2; break; + case X86::VBROADCASTI128: NumLanes = 2; break; + case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break; + case X86::VBROADCASTF32X4rm: NumLanes = 4; break; + case X86::VBROADCASTF32X8rm: NumLanes = 2; break; + case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break; + case X86::VBROADCASTF64X2rm: NumLanes = 4; break; + case X86::VBROADCASTF64X4rm: NumLanes = 2; break; + case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break; + case X86::VBROADCASTI32X4rm: NumLanes = 4; break; + case X86::VBROADCASTI32X8rm: NumLanes = 2; break; + case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break; + case X86::VBROADCASTI64X2rm: NumLanes = 4; break; + case X86::VBROADCASTI64X4rm: NumLanes = 2; break; + } + + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + CS << "["; + for (int l = 0; l != NumLanes; ++l) { + for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) { + if (i != 0 || l != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) + CS << CDS->getElementAsInteger(i); + else if (CDS->getElementType()->isFloatTy()) + CS << CDS->getElementAsFloat(i); + else if (CDS->getElementType()->isDoubleTy()) + CS << CDS->getElementAsDouble(i); + else + CS << "?"; + } + } + CS << "]"; + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + CS << "<"; + for (int l = 0; l != NumLanes; ++l) { + for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { + if (i != 0 || l != 0) + CS << ","; + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + CS << "u"; + } else if 
(auto *CI = dyn_cast<ConstantInt>(COp)) { + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + const auto &Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } + } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { + SmallString<32> Str; + CF->getValueAPF().toString(Str); + CS << Str; + } else { + CS << "?"; + } + } + } + CS << ">"; + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + } + } + break; } MCInst TmpInst; Index: test/CodeGen/X86/avx2-vbroadcast.ll =================================================================== --- test/CodeGen/X86/avx2-vbroadcast.ll +++ test/CodeGen/X86/avx2-vbroadcast.ll @@ -633,13 +633,13 @@ define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp { ; X32-AVX2-LABEL: V111: ; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: vpbroadcastd LCPI29_0, %ymm1 +; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] ; X32-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; X32-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: V111: ; X64-AVX2: ## BB#0: ## %entry -; X64-AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] ; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -660,13 +660,13 @@ define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp { ; X32-AVX2-LABEL: V113: ; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: vbroadcastss LCPI30_0, %ymm1 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125] ; X32-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X32-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: V113: ; X64-AVX2: ## BB#0: ## %entry -; X64-AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125] ; X64-AVX2-NEXT: 
vaddps %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -687,12 +687,12 @@ define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp { ; X32-LABEL: _e2: ; X32: ## BB#0: -; X32-NEXT: vbroadcastss LCPI31_0, %xmm0 +; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125] ; X32-NEXT: retl ; ; X64-LABEL: _e2: ; X64: ## BB#0: -; X64-NEXT: vbroadcastss {{.*}}(%rip), %xmm0 +; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125] ; X64-NEXT: retq %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1 Index: test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll =================================================================== --- test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -1,13 +1,13 @@ -; NOTE: Assertions have been simpilfied MANUALLY after running utils/update_llc_test_checks.py -; Assertions for constant pools have been added MANUALLY. 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL32 -check-prefix=AVX512 -check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX2-64 +; RUN: llc < %s 
-mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX512F-64 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL64 -check-prefix=AVX512F-64 -check-prefix=AVX512BW-64 ;===-----------------------------------------------------------------------------=== ; This test checks the ability to recognize a cross element pattern of @@ -17,20 +17,31 @@ ; => broadcast of the constant vector ;===-----------------------------------------------------------------------------=== -; ALL: LCPI0 -; ALL-NEXT: .short 256 # 0x100 - define <16 x i8> @f16xi8_i16(<16 x i8> %a) { +; AVX-LABEL: f16xi8_i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f16xi8_i16: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256] ; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f16xi8_i16: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f16xi8_i16: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256] ; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq @@ -40,45 +51,48 @@ } -; ALL: .LCPI1 -; ALL-NEXT: .long 50462976 # 0x3020100 - -; AVX: .LCPI1 -; AVX-NEXT .long 50462976 # float 3.82047143E-37 - define <16 x i8> @f16xi8_i32(<16 x i8> %a) { +; AVX-LABEL: f16xi8_i32: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; 
AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f16xi8_i32: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976] ; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f16xi8_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f16xi8_i32: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976] ; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f16xi8_i32: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <16 x i8> , %a %res2 = and <16 x i8> , %res1 ret <16 x i8> %res2 } -; ALL64: .LCPI2 -; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 - -; AVX: .LCPI2 -; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - define <16 x i8> @f16xi8_i64(<16 x i8> %a) { +; AVX-LABEL: f16xi8_i64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f16xi8_i64: ; ALL32: # BB#0: ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] @@ -86,38 +100,56 @@ ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f16xi8_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f16xi8_i64: ; ALL64: # BB#0: -; 
ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528] ; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f16xi8_i64: -; AVX: # BB#0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <16 x i8> , %a %res2 = and <16 x i8> , %res1 ret <16 x i8> %res2 } -; ALL: .LCPI3 -; ALL-NEXT: .short 256 # 0x100 - define <32 x i8> @f32xi8_i16(<32 x i8> %a) { +; AVX-LABEL: f32xi8_i16: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f32xi8_i16: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f32xi8_i16: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f32xi8_i16: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq @@ -127,155 +159,273 @@ } -; ALL: .LCPI4 -; ALL-NEXT: .long 50462976 # 0x3020100 - -; 
AVX: .LCPI4 -; AVX-NEXT: .long 50462976 # float 3.82047143E-37 - define <32 x i8> @f32xi8_i32(<32 x i8> %a) { +; AVX-LABEL: f32xi8_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f32xi8_i32: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f32xi8_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f32xi8_i32: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f32xi8_i32: -; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2 -; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> %res2 } -; ALL64: .LCPI5 -; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 - 
-; AVX: .LCPI5 -; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - define <32 x i8> @f32xi8_i64(<32 x i8> %a) { +; AVX-LABEL: f32xi8_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f32xi8_i64: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastq {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f32xi8_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f32xi8_i64: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f32xi8_i64: -; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> %res2 } -; ALL: .LCPI6 -; ALL-NEXT: .byte 0 # 0x0 -; ALL-NEXT: .byte 1 # 0x1 -; ALL-NEXT: .byte 2 # 0x2 -; ALL-NEXT: .byte 3 # 0x3 -; ALL-NEXT: .byte 
4 # 0x4 -; ALL-NEXT: .byte 5 # 0x5 -; ALL-NEXT: .byte 6 # 0x6 -; ALL-NEXT: .byte 7 # 0x7 -; ALL-NEXT: .byte 8 # 0x8 -; ALL-NEXT: .byte 9 # 0x9 -; ALL-NEXT: .byte 10 # 0xa -; ALL-NEXT: .byte 11 # 0xb -; ALL-NEXT: .byte 12 # 0xc -; ALL-NEXT: .byte 13 # 0xd -; ALL-NEXT: .byte 14 # 0xe -; ALL-NEXT: .byte 15 # 0xf -; ALL-NOT: .byte - define <32 x i8> @f32xi8_i128(<32 x i8> %a) { -; ALL-LABEL: f32xi8_i128: -; ALL: # BB#0: -; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; ALL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX-LABEL: f32xi8_i128: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f32xi8_i128: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] +; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f32xi8_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f32xi8_i128: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> 
%res2 } -; ALL: .LCPI7 -; ALL-NEXT: .short 256 # 0x100 - define <64 x i8> @f64xi8_i16(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i16: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64xi8_i16: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f64xi8_i16: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; 
AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64xi8_i16: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i16: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } -; ALL: .LCPI8 -; ALL-NEXT: .long 50462976 # 0x3020100 - -; AVX: .LCPI8 -; AVX-NEXT: .long 50462976 # float 3.82047143E-37 - define <64 x i8> @f64i8_i32(<64 x i8> %a) { +; AVX-LABEL: f64i8_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, 
%ymm1 +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64i8_i32: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64i8_i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl ; -; AVX-LABEL: f64i8_i32: +; AVX-64-LABEL: f64i8_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64i8_i32: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; 
NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64i8_i32: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <64 x i8> , %a + %res2 = and <64 x i8> , %res1 + ret <64 x i8> %res2 +} + + +define <64 x i8> @f64xi8_i64(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i64: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -283,43 +433,69 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 - %res1 = add <64 x i8> , %a - %res2 = and <64 x i8> , %res1 - ret <64 x i8> %res2 -} - - -; ALL64: .LCPI9 -; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 - -; ALL32: .LCPI9 -; ALL32-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - -; AVX: .LCPI9 -; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - -define <64 x i8> @f64xi8_i64(<64 x i8> %a) { +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64xi8_i64: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; 
NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i64: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl ; -; AVX-LABEL: f64xi8_i64: +; AVX-64-LABEL: f64xi8_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64xi8_i64: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i64: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = 
[506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <64 x i8> , %a + %res2 = and <64 x i8> , %res1 + ret <64 x i8> %res2 +} + + +define <64 x i8> @f64xi8_i128(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i128: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -327,143 +503,184 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 - %res1 = add <64 x i8> , %a - %res2 = and <64 x i8> , %res1 - ret <64 x i8> %res2 -} - - -; ALL: .LCPI10 -; ALL-NEXT: .byte 0 # 0x0 -; ALL-NEXT: .byte 1 # 0x1 -; ALL-NEXT: .byte 2 # 0x2 -; ALL-NEXT: .byte 3 # 0x3 -; ALL-NEXT: .byte 4 # 0x4 -; ALL-NEXT: .byte 5 # 0x5 -; ALL-NEXT: .byte 6 # 0x6 -; ALL-NEXT: .byte 7 # 0x7 -; ALL-NEXT: .byte 8 # 0x8 -; ALL-NEXT: .byte 9 # 0x9 -; ALL-NEXT: .byte 10 # 0xa -; ALL-NEXT: .byte 11 # 0xb -; ALL-NEXT: .byte 12 # 0xc -; ALL-NEXT: .byte 13 # 0xd -; ALL-NEXT: .byte 14 # 0xe -; ALL-NEXT: .byte 15 # 0xf -; ALL-NOT: .byte - -define <64 x i8> @f64xi8_i128(<64 x i8> %a) { +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64xi8_i128: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i128: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f64xi8_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64xi8_i128: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: 
vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i128: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } -; AVX512BW: .LCPI11 -; AVX512BW-NEXT: .byte 0 # 0x0 -; AVX512BW-NEXT: .byte 1 # 0x1 -; AVX512BW-NEXT: .byte 2 # 0x2 -; AVX512BW-NEXT: .byte 3 # 0x3 -; AVX512BW-NEXT: .byte 4 # 0x4 -; AVX512BW-NEXT: .byte 5 # 0x5 -; AVX512BW-NEXT: .byte 6 # 0x6 -; AVX512BW-NEXT: .byte 7 # 0x7 -; AVX512BW-NEXT: .byte 8 # 0x8 -; AVX512BW-NEXT: .byte 9 # 0x9 -; AVX512BW-NEXT: .byte 10 # 0xa -; AVX512BW-NEXT: .byte 11 # 0xb -; AVX512BW-NEXT: .byte 12 # 0xc -; AVX512BW-NEXT: .byte 13 # 0xd -; AVX512BW-NEXT: .byte 14 # 0xe -; AVX512BW-NEXT: .byte 15 # 0xf -; AVX512BW-NEXT: .byte 16 # 0x10 -; AVX512BW-NEXT: .byte 17 # 0x11 -; AVX512BW-NEXT: .byte 18 # 0x12 -; AVX512BW-NEXT: .byte 19 # 0x13 -; AVX512BW-NEXT: .byte 20 # 0x14 -; AVX512BW-NEXT: .byte 21 # 0x15 -; AVX512BW-NEXT: .byte 22 # 0x16 -; AVX512BW-NEXT: .byte 23 # 0x17 -; AVX512BW-NEXT: .byte 24 # 0x18 -; AVX512BW-NEXT: .byte 25 # 0x19 -; AVX512BW-NEXT: .byte 26 # 0x1a -; AVX512BW-NEXT: .byte 27 # 0x1b -; AVX512BW-NEXT: .byte 28 # 0x1c -; AVX512BW-NEXT: .byte 29 # 0x1d -; AVX512BW-NEXT: .byte 30 # 0x1e -; AVX512BW-NEXT: .byte 31 # 0x1f -; AVX512BW-NOT: .byte - define <64 x i8> @f64xi8_i256(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i256: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; +; NO-AVX512BW-LABEL: f64xi8_i256: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl +; ; AVX512BW-LABEL: f64xi8_i256: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f64xi8_i256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX-64-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64xi8_i256: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i256: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> , %a %res2 = and <64 x i8> , %res1 ret <64 x i8> %res2 } -; ALL: .LCPI12 -; ALL-NEXT: .long 65536 # 0x10000 - -; AVX: .LCPI12 -; AVX-NEXT: .long 65536 # float 9.18354962E-41 - define <8 x i16> @f8xi16_i32(<8 x i16> %a) { +; AVX-LABEL: f8xi16_i32: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f8xi16_i32: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536] ; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f8xi16_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastss 
{{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f8xi16_i32: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536] ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f8xi16_i32: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <8 x i16> , %a %res2 = and <8 x i16> , %res1 ret <8 x i16> %res2 } -; ALL64: .LCPI13 -; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000 - -; ALL32: .LCPI13 -; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 - -; AVX: .LCPI13 -; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 - define <8 x i16> @f8xi16_i64(<8 x i16> %a) { +; AVX-LABEL: f8xi16_i64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f8xi16_i64: ; ALL32: # BB#0: ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] @@ -471,67 +688,66 @@ ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f8xi16_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f8xi16_i64: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096] ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f8xi16_i64: -; AVX: # BB#0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, 
%xmm0 %res1 = add <8 x i16> , %a %res2 = and <8 x i16> , %res1 ret <8 x i16> %res2 } -; ALL: .LCPI14 -; ALL-NEXT: .long 65536 # 0x10000 - -; AVX: .LCPI14 -; AVX-NEXT: .long 65536 # float 9.18354962E-41 - define <16 x i16> @f16xi16_i32(<16 x i16> %a) { -; ALL-LABEL: f16xi16_i32: -; ALL: # BB#0: -; ALL-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 -; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ; AVX-LABEL: f16xi16_i32: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 - %res1 = add <16 x i16> , %a - %res2 = and <16 x i16> , %res1 - ret <16 x i16> %res2 -} - - -; ALL64: .LCPI15 -; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000 - -; ALL32: .LCPI15 -; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 +; AVX-NEXT: retl +; +; ALL32-LABEL: f16xi16_i32: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536] +; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f16xi16_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f16xi16_i32: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536] +; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; 
ALL64-NEXT: retq + %res1 = add <16 x i16> , %a + %res2 = and <16 x i16> , %res1 + ret <16 x i16> %res2 +} -; AVX: .LCPI15 -; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 define <16 x i16> @f16xi16_i64(<16 x i16> %a) { -; ALL-LABEL: f16xi16_i64: -; ALL: # BB#0: -; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1 -; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ; AVX-LABEL: f16xi16_i64: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -540,60 +756,154 @@ ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f16xi16_i64: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] +; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f16xi16_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f16xi16_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096] +; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq %res1 = add <16 x i16> , %a %res2 = and <16 x i16> , %res1 ret <16 x i16> %res2 } -; ALL: .LCPI16 -; ALL-NEXT: .short 0 # 0x0 -; ALL-NEXT: .short 1 # 0x1 -; ALL-NEXT: .short 2 # 0x2 -; ALL-NEXT: .short 3 # 0x3 -; ALL-NEXT: .short 4 # 0x4 -; ALL-NEXT: .short 5 # 0x5 -; ALL-NEXT: .short 6 # 0x6 -; ALL-NEXT: .short 7 # 0x7 -; ALL-NOT: .short - define <16 x i16> @f16xi16_i128(<16 x i16> %a) { -; ALL-LABEL: f16xi16_i128: -; 
ALL: # BB#0: -; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX-LABEL: f16xi16_i128: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f16xi16_i128: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] +; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f16xi16_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f16xi16_i128: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq %res1 = add <16 x i16> , %a %res2 = and <16 x i16> , %res1 ret <16 x i16> %res2 } -; ALL: .LCPI17 -; ALL-NEXT: .long 65536 # 0x10000 - -; AVX: .LCPI17 -; AVX-NEXT: .long 65536 # float 9.18354962E-41 - define <32 x i16> @f32xi16_i32(<32 x i16> %a) { +; AVX-LABEL: f32xi16_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, 
%xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f32xi16_i32: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl ; -; AVX-LABEL: f32xi16_i32: +; AVX-64-LABEL: f32xi16_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f32xi16_i32: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] +; 
NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f32xi16_i32: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536] +; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <32 x i16> , %a + %res2 = and <32 x i16> , %res1 + ret <32 x i16> %res2 +} + + +define <32 x i16> @f32xi16_i64(<32 x i16> %a) { +; AVX-LABEL: f32xi16_i64: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -601,43 +911,69 @@ ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 - %res1 = add <32 x i16> , %a - %res2 = and <32 x i16> , %res1 - ret <32 x i16> %res2 -} - - -; ALL64: .LCPI18 -; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000 - -; ALL32: .LCPI18 -; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 - -; AVX: .LCPI18 -; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 - -define <32 x i16> @f32xi16_i64(<32 x i16> %a) { +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f32xi16_i64: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = 
[4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i64: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl ; -; AVX-LABEL: f32xi16_i64: +; AVX-64-LABEL: f32xi16_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f32xi16_i64: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] +; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f32xi16_i64: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = 
[844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096] +; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <32 x i16> , %a + %res2 = and <32 x i16> , %res1 + ret <32 x i16> %res2 +} + + +define <32 x i16> @f32xi16_i128(<32 x i16> %a) { +; AVX-LABEL: f32xi16_i128: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -645,87 +981,151 @@ ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 - %res1 = add <32 x i16> , %a - %res2 = and <32 x i16> , %res1 - ret <32 x i16> %res2 -} - - -; ALL: .LCPI19 -; ALL-NEXT: .short 0 # 0x0 -; ALL-NEXT: .short 1 # 0x1 -; ALL-NEXT: .short 2 # 0x2 -; ALL-NEXT: .short 3 # 0x3 -; ALL-NEXT: .short 4 # 0x4 -; ALL-NEXT: .short 5 # 0x5 -; ALL-NEXT: .short 6 # 0x6 -; ALL-NEXT: .short 7 # 0x7 -; ALL-NOT: .short - -define <32 x i16> @f32xi16_i128(<32 x i16> %a) { +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f32xi16_i128: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i128: 
; AVX512BW: # BB#0: -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f32xi16_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f32xi16_i128: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1] +; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f32xi16_i128: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq %res1 = add <32 x i16> , %a %res2 = and <32 x i16> , %res1 ret <32 x i16> %res2 } -; AVX512BW: .LCPI20 -; AVX512BW-NEXT: .short 0 # 0x0 
-; AVX512BW-NEXT: .short 1 # 0x1 -; AVX512BW-NEXT: .short 2 # 0x2 -; AVX512BW-NEXT: .short 3 # 0x3 -; AVX512BW-NEXT: .short 4 # 0x4 -; AVX512BW-NEXT: .short 5 # 0x5 -; AVX512BW-NEXT: .short 6 # 0x6 -; AVX512BW-NEXT: .short 7 # 0x7 -; AVX512BW-NEXT: .short 8 # 0x8 -; AVX512BW-NEXT: .short 9 # 0x9 -; AVX512BW-NEXT: .short 10 # 0xa -; AVX512BW-NEXT: .short 11 # 0xb -; AVX512BW-NEXT: .short 12 # 0xc -; AVX512BW-NEXT: .short 13 # 0xd -; AVX512BW-NEXT: .short 14 # 0xe -; AVX512BW-NEXT: .short 15 # 0xf -; AVX512BW-NOT: .short - define <32 x i16> @f32xi16_i256(<32 x i16> %a) { +; AVX-LABEL: f32xi16_i256: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] +; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; +; NO-AVX512BW-LABEL: f32xi16_i256: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl +; ; AVX512BW-LABEL: f32xi16_i256: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, 
%zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f32xi16_i256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm4, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f32xi16_i256: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f32xi16_i256: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq %res1 = add <32 x i16> , %a %res2 = and <32 x i16> , %res1 ret <32 x i16> %res2 } -; ALL64: .LCPI21 -; ALL64-NEXT: .quad 4294967296 # 0x100000000 - -; ALL32: .LCPI21 -; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 - -; AVX: .LCPI21 -; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 define <4 x i32> @f4xi32_i64(<4 x i32> %a) { +; AVX-LABEL: f4xi32_i64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vpaddd %xmm1, 
%xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f4xi32_i64: ; ALL32: # BB#0: ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] @@ -733,40 +1133,26 @@ ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f4xi32_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f4xi32_i64: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296] ; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f4xi32_i64: -; AVX: # BB#0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <4 x i32> , %a %res2 = and <4 x i32> , %res1 ret <4 x i32> %res2 } -; ALL64: .LCPI22 -; ALL64-NEXT: .quad 4294967296 # 0x100000000 - -; ALL32: .LCPI22 -; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 - -; AVX: .LCPI22 -; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 - define <8 x i32> @f8xi32_i64(<8 x i32> %a) { -; ALL-LABEL: f8xi32_i64: -; ALL: # BB#0: -; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1 -; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ; AVX-LABEL: f8xi32_i64: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -775,59 +1161,154 @@ ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f8xi32_i64: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] +; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f8xi32_i64: +; 
AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f8xi32_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296] +; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq %res1 = add <8 x i32> , %a %res2 = and <8 x i32> , %res1 ret <8 x i32> %res2 } -; ALL: .LCPI23 -; ALL-NEXT: .long 0 # 0x0 -; ALL-NEXT: .long 1 # 0x1 -; ALL-NEXT: .long 2 # 0x2 -; ALL-NEXT: .long 3 # 0x3 -; ALL-NOT: .long - define <8 x i32> @f8xi32_i128(<8 x i32> %a) { -; ALL-LABEL: f8xi32_i128: -; ALL: # BB#0: -; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX-LABEL: f8xi32_i128: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] +; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f8xi32_i128: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] +; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f8xi32_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] +; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f8xi32_i128: +; ALL64: # BB#0: 
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq %res1 = add <8 x i32> , %a %res2 = and <8 x i32> , %res1 ret <8 x i32> %res2 } -; ALL64: .LCPI24 -; ALL64-NEXT: .quad 4294967296 # 0x100000000 - -; ALL32: .LCPI24 -; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 - -; AVX: .LCPI24 -; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 - define <16 x i32> @f16xi32_i64(<16 x i32> %a) { +; AVX-LABEL: f16xi32_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; ; AVX2-LABEL: f16xi32_i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512-LABEL: f16xi32_i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastq {{.*}}, %zmm1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retl ; -; 
AVX-LABEL: f16xi32_i64: +; AVX-64-LABEL: f16xi32_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f16xi32_i64: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] +; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xi32_i64: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296] +; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: retq + %res1 = add <16 x i32> , %a + %res2 = and <16 x i32> , %res1 + ret <16 x i32> %res2 +} + + +define <16 x i32> @f16xi32_i128(<16 x i32> %a) { +; AVX-LABEL: f16xi32_i128: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -835,51 +1316,103 @@ ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-NEXT: vmovaps {{.*#+}} 
ymm2 = [0,1,2,3,0,1,2,3] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 - %res1 = add <16 x i32> , %a - %res2 = and <16 x i32> , %res1 - ret <16 x i32> %res2 -} - - -; ALL: .LCPI25 -; ALL-NEXT: .long 0 # 0x0 -; ALL-NEXT: .long 1 # 0x1 -; ALL-NEXT: .long 2 # 0x2 -; ALL-NEXT: .long 3 # 0x3 -; ALL-NOT: .long - -define <16 x i32> @f16xi32_i128(<16 x i32> %a) { +; AVX-NEXT: retl +; ; AVX2-LABEL: f16xi32_i128: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512-LABEL: f16xi32_i128: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retl +; +; AVX-64-LABEL: f16xi32_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f16xi32_i128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vpaddd %ymm2, 
%ymm1, %ymm1 +; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xi32_i128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: retq %res1 = add <16 x i32> , %a %res2 = and <16 x i32> , %res1 ret <16 x i32> %res2 } -; ALL64: .LCPI26 -; ALL64-NEXT: .quad 0 # 0x0 -; ALL64-NEXT: .quad 1 # 0x1 -; ALL64-NOT: .quad - define <4 x i64> @f4xi64_i128(<4 x i64> %a) { +; AVX-LABEL: f4xi64_i128: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f4xi64_i128: +; ALL32: # BB#0: +; ALL32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f4xi64_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: movl $1, %eax +; AVX-64-NEXT: vmovq %rax, %xmm2 +; AVX-64-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f4xi64_i128: ; ALL64: # BB#0: -; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] ; ALL64-NEXT: vpaddq 
%ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq @@ -889,15 +1422,62 @@ } -; ALL64: .LCPI27 -; ALL64-NEXT: .quad 0 # 0x0 -; ALL64-NEXT: .quad 1 # 0x1 -; ALL64-NOT: .quad - define <8 x i64> @f8xi64_i128(<8 x i64> %a) { +; AVX-LABEL: f8xi64_i128: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4 +; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; +; AVX2-LABEL: f8xi64_i128: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] +; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f8xi64_i128: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0] +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retl +; +; AVX-64-LABEL: f8xi64_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: movl $1, %eax +; AVX-64-NEXT: vmovq %rax, %xmm3 +; AVX-64-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1] +; 
AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; ; AVX2-64-LABEL: f8xi64_i128: ; AVX2-64: # BB#0: -; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1] +; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -906,57 +1486,99 @@ ; ; AVX512F-64-LABEL: f8xi64_i128: ; AVX512F-64: # BB#0: -; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq -; -; AVX512BW-64-LABEL: f8xi64_i128: -; AVX512BW-64: # BB#0: -; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-64-NEXT: retq %res1 = add <8 x i64> , %a %res2 = and <8 x i64> , %res1 ret <8 x i64> %res2 } -; ALL64: .LCPI28 -; ALL64-NEXT: .quad 0 # 0x0 -; ALL64-NEXT: .quad 1 # 0x1 -; ALL64-NEXT: .quad 2 # 0x2 -; ALL64-NEXT: .quad 3 # 0x3 -; ALL64-NOT: .quad - define <8 x i64> @f8xi64_i256(<8 x i64> %a) { +; AVX-LABEL: f8xi64_i256: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4 +; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: 
vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; +; AVX2-LABEL: f8xi64_i256: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] +; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f8xi64_i256: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0] +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retl +; +; AVX-64-LABEL: f8xi64_i256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] +; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: movl $1, %eax +; AVX-64-NEXT: vmovq %rax, %xmm4 +; AVX-64-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f8xi64_i256: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] +; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq +; ; AVX512F-64-LABEL: f8xi64_i256: ; AVX512F-64: # BB#0: -; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; 
AVX512F-64-NEXT: retq -; -; AVX512BW-64-LABEL: f8xi64_i256: -; AVX512BW-64: # BB#0: -; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-64-NEXT: retq %res1 = add <8 x i64> , %a %res2 = and <8 x i64> , %res1 ret <8 x i64> %res2 } -; ALL: .LCPI29 -; ALL-NEXT: .quad 4575657222482165760 - -; AVX: .LCPI29 -; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 - define <4 x float> @f4xf32_f64(<4 x float> %a) { +; AVX-LABEL: f4xf32_f64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f4xf32_f64: ; ALL32: # BB#0: ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] @@ -964,221 +1586,367 @@ ; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f4xf32_f64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f4xf32_f64: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] ; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f4xf32_f64: -; AVX: # BB#0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 %res1 = fadd <4 x float> , %a %res2 = fdiv <4 x float> , %res1 ret <4 x float> %res2 } -; ALL64: .LCPI30 -; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000 - -; ALL32: .LCPI30 -; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 - -; AVX: .LCPI30 -; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 - define <8 x float> @f8xf32_f64(<8 x float> %a) { -; ALL-LABEL: 
f8xf32_f64: -; ALL: # BB#0: -; ALL-NEXT: vbroadcastsd {{.*}}, %ymm1 -; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; ; AVX-LABEL: f8xf32_f64: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm1 +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f8xf32_f64: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492] +; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f8xf32_f64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492] +; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f8xf32_f64: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; ALL64-NEXT: retq %res1 = fadd <8 x float> , %a %res2 = fdiv <8 x float> , %res1 ret <8 x float> %res2 } -; ALL: .LCPI31 -; ALL-NEXT: .long 1082130432 # float 4 -; ALL-NEXT: .long 1065353216 # float 1 -; ALL-NEXT: .long 1073741824 # float 2 -; ALL-NEXT: .long 1077936128 # float 3 -; ALL-NOT: .long - define <8 x float> @f8xf32_f128(<8 x float> %a) { -; ALL-LABEL: f8xf32_f128: -; ALL: # BB#0: -; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; ; AVX-LABEL: f8xf32_f128: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = 
[4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-NEXT: # ymm1 = mem[0,1,0,1] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f8xf32_f128: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] +; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f8xf32_f128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f8xf32_f128: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; ALL64-NEXT: retq %res1 = fadd <8 x float> , %a %res2 = fdiv <8 x float> , %res1 ret <8 x float> %res2 } -; ALL64: .LCPI32 -; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000 - -; ALL32: .LCPI32 -; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 - -; AVX: .LCPI32 -; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 - define <16 x float> @f16xf32_f64(<16 x float> %a) { +; AVX-LABEL: f16xf32_f64: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492] +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivps %ymm1, 
%ymm2, %ymm1 +; AVX-NEXT: retl +; ; AVX2-LABEL: f16xf32_f64: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastsd {{.*}}, %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492] ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl ; ; AVX512-LABEL: f16xf32_f64: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastsd {{.*}}, %zmm1 +; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492] ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl ; -; AVX-LABEL: f16xf32_f64: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm2 -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-64-LABEL: f16xf32_f64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492] +; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f16xf32_f64: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xf32_f64: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = 
[4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <16 x float> , %a %res2 = fdiv <16 x float> , %res1 ret <16 x float> %res2 } -; ALL: .LCPI33 -; ALL-NEXT: .long 1082130432 # float 4 -; ALL-NEXT: .long 1065353216 # float 1 -; ALL-NEXT: .long 1073741824 # float 2 -; ALL-NEXT: .long 1077936128 # float 3 -; ALL-NOT: .long - define <16 x float> @f16xf32_f128(<16 x float> %a) { +; AVX-LABEL: f16xf32_f128: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: retl +; ; AVX2-LABEL: f16xf32_f128: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl ; ; AVX512-LABEL: f16xf32_f128: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; 
AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl ; -; AVX-LABEL: f16xf32_f128: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-64-LABEL: f16xf32_f128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f16xf32_f128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xf32_f128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <16 x float> , %a %res2 = fdiv <16 x float> , %res1 ret <16 x float> %res2 } -; AVX512: .LCPI34 -; AVX512-NEXT: .long 1090519040 # float 8 -; AVX512-NEXT: .long 1065353216 # float 1 -; AVX512-NEXT: .long 1073741824 # float 2 -; AVX512-NEXT: .long 1077936128 # float 3 -; 
AVX512-NEXT: .long 1082130432 # float 4 -; AVX512-NEXT: .long 1084227584 # float 5 -; AVX512-NEXT: .long 1086324736 # float 6 -; AVX512-NEXT: .long 1088421888 # float 7 -; AVX512-NOT: .long - define <16 x float> @f16xf32_f256(<16 x float> %a) { +; AVX-LABEL: f16xf32_f256: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: retl +; +; AVX2-LABEL: f16xf32_f256: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl +; ; AVX512-LABEL: f16xf32_f256: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl +; +; AVX-64-LABEL: f16xf32_f256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f16xf32_f256: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vmovaps {{.*#+}} 
ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xf32_f256: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <16 x float> , %a %res2 = fdiv <16 x float> , %res1 ret <16 x float> %res2 } -; ALL: .LCPI35 -; ALL-NEXT: .quad 4611686018427387904 # double 2 -; ALL-NEXT: .quad 4607182418800017408 # double 1 -; ALL-NOT: .quad - define <4 x double> @f4xf64_f128(<4 x double> %a) { -; ALL-LABEL: f4xf64_f128: -; ALL: # BB#0: -; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; ALL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vdivpd %ymm0, %ymm1, %ymm0 -; ; AVX-LABEL: f4xf64_f128: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-NEXT: # ymm1 = mem[0,1,0,1] ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f4xf64_f128: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] +; ALL32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f4xf64_f128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = 
[2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f4xf64_f128: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; ALL64-NEXT: retq %res1 = fadd <4 x double> , %a %res2 = fdiv <4 x double> , %res1 ret <4 x double> %res2 } -; ALL: .LCPI36 -; ALL-NEXT: .quad 4611686018427387904 # double 2 -; ALL-NEXT: .quad 4607182418800017408 # double 1 -; ALL-NOT: .quad - define <8 x double> @f8xf64_f128(<8 x double> %a) { +; AVX-LABEL: f8xf64_f128: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: retl +; ; AVX2-LABEL: f8xf64_f128: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl ; ; AVX512-LABEL: f8xf64_f128: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl ; -; 
AVX-LABEL: f8xf64_f128: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-64-LABEL: f8xf64_f128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f8xf64_f128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xf64_f128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <8 x double> , %a %res2 = fdiv <8 x double> , %res1 ret <8 x double> %res2 @@ -1193,11 +1961,57 @@ ; AVX512-NOT: .quad define <8 x double> @f8xf64_f256(<8 x double> %a) { +; AVX-LABEL: f8xf64_f256: +; AVX: # BB#0: +; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: retl +; +; AVX2-LABEL: f8xf64_f256: +; AVX2: # BB#0: +; AVX2-NEXT: vmovapd 
{{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl +; ; AVX512-LABEL: f8xf64_f256: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl +; +; AVX-64-LABEL: f8xf64_f256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f8xf64_f256: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xf64_f256: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <8 x double> , %a %res2 = fdiv <8 x double> , %res1 ret <8 x double> %res2 @@ -1205,32 +2019,34 @@ -; ALL: .LCPI38 -; ALL-NEXT: .long 4290379776 # 0xffba0000 - -; AVX: .LCPI38 -; AVX-NEXT: .long 4290379776 # float NaN - define <8 x i16> 
@f8xi16_i32_NaN(<8 x i16> %a) { +; AVX-LABEL: f8xi16_i32_NaN: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f8xi16_i32_NaN: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] ; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f8xi16_i32_NaN: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f8xi16_i32_NaN: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f8xi16_i32_NaN: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <8 x i16> , %a %res2 = and <8 x i16> , %res1 ret <8 x i16> %res2