Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -18871,10 +18871,13 @@
       return DAG.getStore(Chain, dl, DataToCompress, Addr,
                           MemIntr->getMemOperand());
 
+    SDValue LoadAddress = DAG.getLoad(VT, SDLoc(Op), Chain, Addr,
+                                      MemIntr->getPointerInfo(), MemIntr->getAlignment());
     SDValue Compressed =
       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
-                           Mask, DAG.getUNDEF(VT), Subtarget, DAG);
-    return DAG.getStore(Chain, dl, Compressed, Addr,
+                           Mask, LoadAddress, Subtarget, DAG);
+    LoadSDNode *LoadInst = dyn_cast<LoadSDNode>(LoadAddress);
+    return DAG.getStore(LoadInst->getChain(), dl, Compressed, Addr,
                         MemIntr->getMemOperand());
   }
   case TRUNCATE_TO_MEM_VI8:
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -7219,7 +7219,7 @@
               (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
               OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
               [(store (_.VT (vselect _.KRCWM:$mask,
-                              (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)),
+                              (_.VT (X86compress _.RC:$src)), (_.VT (load addr:$dst)))),
                 addr:$dst)]>,
               EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
 }
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -893,6 +893,38 @@
   ret <4 x i32> %res
 }
 
+@xmm = common global <4 x i32> zeroinitializer, align 16
+@k8 = common global i8 0, align 1
+@res = common global <4 x i32> zeroinitializer, align 16
+
+define i32 @compr11() {
+; CHECK-LABEL: compr11:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movq _xmm@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
+; CHECK-NEXT:    ## fixup A - offset: 3, value: _xmm@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
+; CHECK-NEXT:    vmovdqa32 (%rax), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6f,0x00]
+; CHECK-NEXT:    movq _k8@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
+; CHECK-NEXT:    ## fixup A - offset: 3, value: _k8@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
+; CHECK-NEXT:    movzbl (%rax), %eax ## encoding: [0x0f,0xb6,0x00]
+; CHECK-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; CHECK-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
+; CHECK-NEXT:    vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT:    vmovdqa32 %xmm0, -{{[0-9]+}}(%rsp) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x84,0x24,0xd8,0xff,0xff,0xff]
+; CHECK-NEXT:    vmovdqa32 %xmm1, -{{[0-9]+}}(%rsp) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x8c,0x24,0xe8,0xff,0xff,0xff]
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %.compoundliteral = alloca <2 x i64>, align 16
+  %res = alloca <4 x i32>, align 16
+  %a0 = load <4 x i32>, <4 x i32>* @xmm, align 16
+  %a2 = load i8, i8* @k8, align 1
+  %a21 = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %a0, <4 x i32> zeroinitializer, i8 %a2) #2
+  store volatile <4 x i32> %a21, <4 x i32>* %res, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %.compoundliteral, align 16
+  ret i32 0
+}
+
+
 declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
 
 ; Expand