Index: llvm/lib/Target/X86/X86InstrFoldTables.h
===================================================================
--- llvm/lib/Target/X86/X86InstrFoldTables.h
+++ llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -19,35 +19,48 @@
 enum {
   // Select which memory operand is being unfolded.
-  // (stored in bits 0 - 3)
+  // (stored in bits 0 - 2)
   TB_INDEX_0    = 0,
   TB_INDEX_1    = 1,
   TB_INDEX_2    = 2,
   TB_INDEX_3    = 3,
   TB_INDEX_4    = 4,
-  TB_INDEX_MASK = 0xf,
+  TB_INDEX_MASK = 0x7,
 
   // Do not insert the reverse map (MemOp -> RegOp) into the table.
   // This may be needed because there is a many -> one mapping.
-  TB_NO_REVERSE   = 1 << 4,
+  TB_NO_REVERSE   = 1 << 3,
 
   // Do not insert the forward map (RegOp -> MemOp) into the table.
   // This is needed for Native Client, which prohibits branch
   // instructions from using a memory operand.
-  TB_NO_FORWARD   = 1 << 5,
+  TB_NO_FORWARD   = 1 << 4,
 
-  TB_FOLDED_LOAD  = 1 << 6,
-  TB_FOLDED_STORE = 1 << 7,
+  TB_FOLDED_LOAD  = 1 << 5,
+  TB_FOLDED_STORE = 1 << 6,
+  TB_FOLDED_BCAST = 1 << 7,
 
   // Minimum alignment required for load/store.
-  // Used for RegOp->MemOp conversion.
-  // (stored in bits 8 - 15)
+  // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0
+  // to mean align of 0.
+  // (stored in bits 8 - 11)
   TB_ALIGN_SHIFT = 8,
-  TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
-  TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
-  TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
-  TB_ALIGN_64    =   64 << TB_ALIGN_SHIFT,
-  TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
+  TB_ALIGN_NONE  =   0 << TB_ALIGN_SHIFT,
+  TB_ALIGN_16    =   5 << TB_ALIGN_SHIFT,
+  TB_ALIGN_32    =   6 << TB_ALIGN_SHIFT,
+  TB_ALIGN_64    =   7 << TB_ALIGN_SHIFT,
+  TB_ALIGN_MASK  = 0xf << TB_ALIGN_SHIFT,
+
+  // Broadcast type.
+  // (stored in bits 12 - 15)
+  TB_BCAST_TYPE_SHIFT = 12,
+  TB_BCAST_B    = 0 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_W    = 1 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_D    = 2 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_Q    = 3 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_SS   = 4 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_SD   = 5 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_MASK = 0xf << TB_BCAST_TYPE_SHIFT,
 };
 
 // This struct is used for both the folding and unfold tables. The KeyOp
 // is used to determine the sorting order.
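As a standalone illustration of the repacked Flags field (a sketch, not part of the patch; the constant values are copied from the enum above), the following self-contained C++ shows how one packed entry decodes, including the new Log2(Align) + 1 alignment encoding that the X86InstrInfo.cpp hunk below undoes at lookup time:

// flags_layout_sketch.cpp - standalone illustration, not part of the patch.
// Mirrors the repacked layout: index in bits 0-2, flag bits in 3-7,
// Log2(Align)+1 in bits 8-11, broadcast type in bits 12-15.
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t TB_INDEX_MASK = 0x7;
  const unsigned TB_ALIGN_SHIFT = 8;
  const uint16_t TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT;
  const unsigned TB_BCAST_TYPE_SHIFT = 12;
  const uint16_t TB_BCAST_MASK = 0xf << TB_BCAST_TYPE_SHIFT;

  // TB_INDEX_2 | TB_ALIGN_16 | TB_BCAST_SS under the new encoding.
  const uint16_t Flags = 2 | (5 << TB_ALIGN_SHIFT) | (4 << TB_BCAST_TYPE_SHIFT);

  assert((Flags & TB_INDEX_MASK) == 2);
  assert(((Flags & TB_BCAST_MASK) >> TB_BCAST_TYPE_SHIFT) == 4); // TB_BCAST_SS

  // Alignment is stored as Log2(Align) + 1, so field value 0 still means
  // "no alignment requirement" while 5 decodes to 16 bytes.
  unsigned AlignField = (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
  unsigned MinAlign = AlignField ? 1u << (AlignField - 1) : 0;
  assert(MinAlign == 16); // same decode as added in X86InstrInfo.cpp below
  return 0;
}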
Index: llvm/lib/Target/X86/X86InstrFoldTables.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -5245,6 +5245,69 @@
   { X86::VXORPSZrrk,  X86::VXORPSZrmk,  0 },
 };
 
+static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
+  { X86::VADDPDZ128rr,  X86::VADDPDZ128rmb,  TB_BCAST_SD },
+  { X86::VADDPDZ256rr,  X86::VADDPDZ256rmb,  TB_BCAST_SD },
+  { X86::VADDPDZrr,     X86::VADDPDZrmb,     TB_BCAST_SD },
+  { X86::VADDPSZ128rr,  X86::VADDPSZ128rmb,  TB_BCAST_SS },
+  { X86::VADDPSZ256rr,  X86::VADDPSZ256rmb,  TB_BCAST_SS },
+  { X86::VADDPSZrr,     X86::VADDPSZrmb,     TB_BCAST_SS },
+  { X86::VDIVPDZ128rr,  X86::VDIVPDZ128rmb,  TB_BCAST_SD },
+  { X86::VDIVPDZ256rr,  X86::VDIVPDZ256rmb,  TB_BCAST_SD },
+  { X86::VDIVPDZrr,     X86::VDIVPDZrmb,     TB_BCAST_SD },
+  { X86::VDIVPSZ128rr,  X86::VDIVPSZ128rmb,  TB_BCAST_SS },
+  { X86::VDIVPSZ256rr,  X86::VDIVPSZ256rmb,  TB_BCAST_SS },
+  { X86::VDIVPSZrr,     X86::VDIVPSZrmb,     TB_BCAST_SS },
+  { X86::VMULPDZ128rr,  X86::VMULPDZ128rmb,  TB_BCAST_SD },
+  { X86::VMULPDZ256rr,  X86::VMULPDZ256rmb,  TB_BCAST_SD },
+  { X86::VMULPDZrr,     X86::VMULPDZrmb,     TB_BCAST_SD },
+  { X86::VMULPSZ128rr,  X86::VMULPSZ128rmb,  TB_BCAST_SS },
+  { X86::VMULPSZ256rr,  X86::VMULPSZ256rmb,  TB_BCAST_SS },
+  { X86::VMULPSZrr,     X86::VMULPSZrmb,     TB_BCAST_SS },
+  { X86::VPADDDZ128rr,  X86::VPADDDZ128rmb,  TB_BCAST_D },
+  { X86::VPADDDZ256rr,  X86::VPADDDZ256rmb,  TB_BCAST_D },
+  { X86::VPADDDZrr,     X86::VPADDDZrmb,     TB_BCAST_D },
+  { X86::VPADDQZ128rr,  X86::VPADDQZ128rmb,  TB_BCAST_Q },
+  { X86::VPADDQZ256rr,  X86::VPADDQZ256rmb,  TB_BCAST_Q },
+  { X86::VPADDQZrr,     X86::VPADDQZrmb,     TB_BCAST_Q },
+  { X86::VPANDDZ128rr,  X86::VPANDDZ128rmb,  TB_BCAST_D },
+  { X86::VPANDDZ256rr,  X86::VPANDDZ256rmb,  TB_BCAST_D },
+  { X86::VPANDDZrr,     X86::VPANDDZrmb,     TB_BCAST_D },
+  { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D },
+  { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D },
+  { X86::VPANDNDZrr,    X86::VPANDNDZrmb,    TB_BCAST_D },
+  { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q },
+  { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q },
+  { X86::VPANDNQZrr,    X86::VPANDNQZrmb,    TB_BCAST_Q },
+  { X86::VPANDQZ128rr,  X86::VPANDQZ128rmb,  TB_BCAST_Q },
+  { X86::VPANDQZ256rr,  X86::VPANDQZ256rmb,  TB_BCAST_Q },
+  { X86::VPANDQZrr,     X86::VPANDQZrmb,     TB_BCAST_Q },
+  { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D },
+  { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D },
+  { X86::VPMULLDZrr,    X86::VPMULLDZrmb,    TB_BCAST_D },
+  { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q },
+  { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q },
+  { X86::VPMULLQZrr,    X86::VPMULLQZrmb,    TB_BCAST_Q },
+  { X86::VPORDZ128rr,   X86::VPORDZ128rmb,   TB_BCAST_D },
+  { X86::VPORDZ256rr,   X86::VPORDZ256rmb,   TB_BCAST_D },
+  { X86::VPORDZrr,      X86::VPORDZrmb,      TB_BCAST_D },
+  { X86::VPORQZ128rr,   X86::VPORQZ128rmb,   TB_BCAST_Q },
+  { X86::VPORQZ256rr,   X86::VPORQZ256rmb,   TB_BCAST_Q },
+  { X86::VPORQZrr,      X86::VPORQZrmb,      TB_BCAST_Q },
+  { X86::VPXORDZ128rr,  X86::VPXORDZ128rmb,  TB_BCAST_D },
+  { X86::VPXORDZ256rr,  X86::VPXORDZ256rmb,  TB_BCAST_D },
+  { X86::VPXORDZrr,     X86::VPXORDZrmb,     TB_BCAST_D },
+  { X86::VPXORQZ128rr,  X86::VPXORQZ128rmb,  TB_BCAST_Q },
+  { X86::VPXORQZ256rr,  X86::VPXORQZ256rmb,  TB_BCAST_Q },
+  { X86::VPXORQZrr,     X86::VPXORQZrmb,     TB_BCAST_Q },
+  { X86::VSUBPDZ128rr,  X86::VSUBPDZ128rmb,  TB_BCAST_SD },
+  { X86::VSUBPDZ256rr,  X86::VSUBPDZ256rmb,  TB_BCAST_SD },
+  { X86::VSUBPDZrr,     X86::VSUBPDZrmb,     TB_BCAST_SD },
+  { X86::VSUBPSZ128rr,  X86::VSUBPSZ128rmb,  TB_BCAST_SS },
+  { X86::VSUBPSZ256rr,  X86::VSUBPSZ256rmb,  TB_BCAST_SS },
+  { X86::VSUBPSZrr,     X86::VSUBPSZrmb,     TB_BCAST_SS },
+};
+
 static const X86MemoryFoldTableEntry *
 lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
 #ifndef NDEBUG
@@ -5287,6 +5350,12 @@
                               std::end(MemoryFoldTable4)) ==
            std::end(MemoryFoldTable4) &&
            "MemoryFoldTable4 is not sorted and unique!");
+    assert(std::is_sorted(std::begin(BroadcastFoldTable2),
+                          std::end(BroadcastFoldTable2)) &&
+           std::adjacent_find(std::begin(BroadcastFoldTable2),
+                              std::end(BroadcastFoldTable2)) ==
+           std::end(BroadcastFoldTable2) &&
+           "BroadcastFoldTable2 is not sorted and unique!");
     FoldTablesChecked.store(true, std::memory_order_relaxed);
   }
 #endif
@@ -5355,6 +5424,11 @@
     // Index 4, folded load
     addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD);
 
+  // Broadcast tables.
+  for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2)
+    // Index 2, folded broadcast
+    addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
   // Sort the memory->reg unfold table.
   array_pod_sort(Table.begin(), Table.end());
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4805,6 +4805,7 @@
   if (I != nullptr) {
     unsigned Opcode = I->DstOp;
     unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+    MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0;
     if (Align < MinAlign)
       return nullptr;
     bool NarrowToMOV32rm = false;
@@ -5310,6 +5311,69 @@
   return StoreMMOs;
 }
 
+static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I,
+                                   const TargetRegisterClass *RC,
+                                   const X86Subtarget &STI) {
+  assert(STI.hasAVX512() && "Expected at least AVX512!");
+  unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
+  assert((SpillSize == 64 || STI.hasVLX()) &&
+         "Can't broadcast less than 64 bytes without AVX512VL!");
+
+  switch (I->Flags & TB_BCAST_MASK) {
+  default: llvm_unreachable("Unexpected broadcast type!");
+  case TB_BCAST_B:
+    assert(STI.hasBWI() && "Byte broadcast requires avx512bw!");
+    switch (SpillSize) {
+    default: llvm_unreachable("Unknown spill size");
+    case 16: return X86::VPBROADCASTBZ128m;
+    case 32: return X86::VPBROADCASTBZ256m;
+    case 64: return X86::VPBROADCASTBZm;
+    }
+    break;
+  case TB_BCAST_W:
+    assert(STI.hasBWI() && "Word broadcast requires avx512bw!");
+    switch (SpillSize) {
+    default: llvm_unreachable("Unknown spill size");
+    case 16: return X86::VPBROADCASTWZ128m;
+    case 32: return X86::VPBROADCASTWZ256m;
+    case 64: return X86::VPBROADCASTWZm;
+    }
+    break;
+  case TB_BCAST_D:
+    switch (SpillSize) {
+    default: llvm_unreachable("Unknown spill size");
+    case 16: return X86::VPBROADCASTDZ128m;
+    case 32: return X86::VPBROADCASTDZ256m;
+    case 64: return X86::VPBROADCASTDZm;
+    }
+    break;
+  case TB_BCAST_Q:
+    switch (SpillSize) {
+    default: llvm_unreachable("Unknown spill size");
+    case 16: return X86::VPBROADCASTQZ128m;
+    case 32: return X86::VPBROADCASTQZ256m;
+    case 64: return X86::VPBROADCASTQZm;
+    }
+    break;
+  case TB_BCAST_SS:
+    switch (SpillSize) {
+    default: llvm_unreachable("Unknown spill size");
+    case 16: return X86::VBROADCASTSSZ128m;
+    case 32: return X86::VBROADCASTSSZ256m;
+    case 64: return X86::VBROADCASTSSZm;
+    }
+    break;
+  case TB_BCAST_SD:
+    switch (SpillSize) {
+    default: llvm_unreachable("Unknown spill size");
+    case 16: return X86::VMOVDDUPZ128rm;
+    case 32: return X86::VBROADCASTSDZ256m;
+    case 64: return X86::VBROADCASTSDZm;
+    }
+    break;
+  }
+}
+
 bool X86InstrInfo::unfoldMemoryOperand(
     MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
     bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
@@ -5320,6 +5384,7 @@
   unsigned Index = I->Flags & TB_INDEX_MASK;
   bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
   bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+  bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
   if (UnfoldLoad && !FoldedLoad)
     return false;
   UnfoldLoad &= FoldedLoad;
@@ -5328,6 +5393,7 @@
   UnfoldStore &= FoldedStore;
 
   const MCInstrDesc &MCID = get(Opc);
+
   const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
   // TODO: Check if 32-byte or greater accesses are slow too?
@@ -5353,12 +5419,19 @@
     AfterOps.push_back(Op);
   }
 
-  // Emit the load instruction.
+  // Emit the load or broadcast instruction.
   if (UnfoldLoad) {
     auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
-    unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
-    bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
-    unsigned Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
+
+    unsigned Opc;
+    if (FoldedBCast) {
+      Opc = getBroadcastOpcode(I, RC, Subtarget);
+    } else {
+      unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+      bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+      Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
+    }
+
     DebugLoc DL;
     MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
     for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
@@ -5459,6 +5532,7 @@
   unsigned Index = I->Flags & TB_INDEX_MASK;
   bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
   bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+  bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
   const MCInstrDesc &MCID = get(Opc);
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -5492,10 +5566,17 @@
       return false;
     // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
     // memory access is slow above.
-    unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
-    bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
-    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
-                              VT, MVT::Other, AddrOps);
+
+    unsigned Opc;
+    if (FoldedBCast) {
+      Opc = getBroadcastOpcode(I, RC, Subtarget);
+    } else {
+      unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+      bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+      Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
+    }
+
+    Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
     NewNodes.push_back(Load);
 
     // Preserve memory reference information.
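For reference, here is a minimal standalone sketch (hypothetical helper, not code from the patch; pickUnfoldLoadOpcode and its parameters are illustrative names, not LLVM APIs) of the decision the two unfold paths above now share: entries tagged TB_FOLDED_BCAST unfold through getBroadcastOpcode, everything else keeps the existing aligned/unaligned full-width reload choice.

// unfold_opcode_sketch.cpp - hypothetical model of the logic added above.
#include <algorithm>
#include <cstdint>

enum : uint16_t { TB_FOLDED_BCAST = 1 << 7 }; // matches the header above

unsigned pickUnfoldLoadOpcode(uint16_t Flags, unsigned SpillSize,
                              unsigned MMOAlign, unsigned BcastOpc,
                              unsigned AlignedOpc, unsigned UnalignedOpc) {
  // Broadcast entries reload a single element, so alignment is irrelevant;
  // e.g. a 64-byte TB_BCAST_D entry would come back as VPBROADCASTDZm.
  if (Flags & TB_FOLDED_BCAST)
    return BcastOpc;
  // Otherwise keep the old rule: full-width load, using the aligned form
  // only if the memory operand guarantees max(spill size, 16) bytes.
  unsigned Alignment = std::max(SpillSize, 16u);
  return MMOAlign >= Alignment ? AlignedOpc : UnalignedOpc;
}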
Index: llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -8,12 +8,12 @@
 ; CHECK-LABEL: bcast_unfold_add_v16i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -42,12 +42,12 @@
 ; CHECK-LABEL: bcast_unfold_add_v8i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB1_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -76,12 +76,12 @@
 ; CHECK-LABEL: bcast_unfold_add_v4i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB2_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB2_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -109,12 +109,12 @@
 ; CHECK-LABEL: bcast_unfold_add_v8i64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB3_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB3_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -143,12 +143,12 @@
 ; CHECK-LABEL: bcast_unfold_add_v4i64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB4_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB4_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -210,12 +210,12 @@
 ; CHECK-LABEL: bcast_unfold_mul_v16i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB6_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB6_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -244,12 +244,12 @@
 ; CHECK-LABEL: bcast_unfold_mul_v8i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB7_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpmulld {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB7_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -278,12 +278,12 @@
 ; CHECK-LABEL: bcast_unfold_mul_v4i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB8_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB8_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -415,12 +415,12 @@
 ; CHECK-LABEL: bcast_unfold_or_v16i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB12_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpord {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpord 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB12_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -449,12 +449,12 @@
 ; CHECK-LABEL: bcast_unfold_or_v8i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB13_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpord {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vorps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB13_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -483,12 +483,12 @@
 ; CHECK-LABEL: bcast_unfold_or_v4i32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB14_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vpord {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vorps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB14_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -516,12 +516,12 @@
 ; CHECK-LABEL: bcast_unfold_or_v8i64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB15_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vporq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB15_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -550,12 +550,12 @@
 ; CHECK-LABEL: bcast_unfold_or_v4i64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB16_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vorps 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB16_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -617,12 +617,12 @@
 ; CHECK-LABEL: bcast_unfold_fneg_v16f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB18_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpxord 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB18_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -651,12 +651,12 @@
 ; CHECK-LABEL: bcast_unfold_fneg_v8f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB19_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpxord {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB19_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -685,12 +685,12 @@
 ; CHECK-LABEL: bcast_unfold_fneg_v4f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB20_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vpxord {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB20_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -718,12 +718,12 @@
 ; CHECK-LABEL: bcast_unfold_fneg_v8f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB21_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB21_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -752,12 +752,12 @@
 ; CHECK-LABEL: bcast_unfold_fneg_v4f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB22_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpxorq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB22_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -819,12 +819,12 @@
 ; CHECK-LABEL: bcast_unfold_fabs_v16f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB24_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vpandd 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB24_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -856,12 +856,12 @@
 ; CHECK-LABEL: bcast_unfold_fabs_v8f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB25_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vandps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB25_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -893,12 +893,12 @@
 ; CHECK-LABEL: bcast_unfold_fabs_v4f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB26_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vandps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB26_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -929,12 +929,12 @@
 ; CHECK-LABEL: bcast_unfold_fabs_v8f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB27_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vpandq 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB27_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -966,12 +966,12 @@
 ; CHECK-LABEL: bcast_unfold_fabs_v4f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB28_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vandps 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB28_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1039,12 +1039,12 @@
 ; CHECK-LABEL: bcast_unfold_fadd_v16f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB30_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovups %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB30_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1073,12 +1073,12 @@
 ; CHECK-LABEL: bcast_unfold_fadd_v8f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB31_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vaddps {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovups %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB31_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1107,12 +1107,12 @@
 ; CHECK-LABEL: bcast_unfold_fadd_v4f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB32_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovups %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB32_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1140,12 +1140,12 @@
 ; CHECK-LABEL: bcast_unfold_fadd_v8f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB33_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovupd %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB33_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1174,12 +1174,12 @@
 ; CHECK-LABEL: bcast_unfold_fadd_v4f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB34_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vaddpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovupd %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB34_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1241,12 +1241,12 @@
 ; CHECK-LABEL: bcast_unfold_fmul_v16f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB36_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovups %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB36_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1275,12 +1275,12 @@
 ; CHECK-LABEL: bcast_unfold_fmul_v8f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB37_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vmulps {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovups %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB37_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1309,12 +1309,12 @@
 ; CHECK-LABEL: bcast_unfold_fmul_v4f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB38_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vmulps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovups %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %xmm0, %xmm1
+; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB38_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1342,12 +1342,12 @@
 ; CHECK-LABEL: bcast_unfold_fmul_v8f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB39_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vmulpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovupd %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
+; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB39_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1376,12 +1376,12 @@
 ; CHECK-LABEL: bcast_unfold_fmul_v4f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB40_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vmulpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovupd %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
+; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB40_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1443,12 +1443,13 @@
 ; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB42_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vdivps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovups %zmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
+; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm1
+; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB42_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1477,12 +1478,13 @@
 ; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB43_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vdivps {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovups %ymm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
+; CHECK-NEXT:    vdivps %ymm0, %ymm1, %ymm1
+; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB43_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1511,12 +1513,13 @@
 ; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB44_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm0
-; CHECK-NEXT:    vdivps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vmovups %xmm0, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT:    vdivps %xmm0, %xmm1, %xmm1
+; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB44_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1544,12 +1547,13 @@
 ; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB45_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm0
-; CHECK-NEXT:    vdivpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    vmovupd %zmm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
+; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm1
+; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB45_1
 ; CHECK-NEXT:  # %bb.2: # %bb9
@@ -1578,12 +1582,13 @@
 ; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB46_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm0
-; CHECK-NEXT:    vdivpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vmovupd %ymm0, 8192(%rdi,%rax)
+; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT:    vdivpd %ymm0, %ymm1, %ymm1
+; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB46_1
 ; CHECK-NEXT:  # %bb.2: # %bb9