diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11361,6 +11361,23 @@
   if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
     return Chain;
 
+  // Remove a masked store if base pointers and masks are equal.
+  if (MaskedStoreSDNode *MST1 = dyn_cast<MaskedStoreSDNode>(Chain)) {
+    if (MST->isUnindexed() && MST->isSimple() && MST1->isUnindexed() &&
+        MST1->isSimple() && MST1->getBasePtr() == Ptr &&
+        !MST->getBasePtr().isUndef() &&
+        ((Mask == MST1->getMask() && MST->getMemoryVT().getStoreSize() ==
+                                         MST1->getMemoryVT().getStoreSize()) ||
+         ISD::isConstantSplatVectorAllOnes(Mask.getNode())) &&
+        TypeSize::isKnownLE(MST1->getMemoryVT().getStoreSize(),
+                            MST->getMemoryVT().getStoreSize())) {
+      CombineTo(MST1, MST1->getChain());
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        AddToWorklist(N);
+      return SDValue(N, 0);
+    }
+  }
+
   // If this is a masked load with an all ones mask, we can use a unmasked load.
   // FIXME: Can we do this for indexed, compressing, or truncating stores?
   if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() &&
diff --git a/llvm/test/CodeGen/AArch64/sve-dead-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-dead-masked-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-dead-masked-store.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define void @dead_masked_store(<vscale x 4 x i32> %val, ptr %a, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: dead_masked_store:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @dead_masked_store_alltrue_same(<vscale x 4 x i32> %val, ptr %a, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: dead_masked_store_alltrue_same:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %alltrue.ins = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %alltrue = shufflevector <vscale x 4 x i1> %alltrue.ins, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val, ptr %a, i32 4, <vscale x 4 x i1> %alltrue)
+  ret void
+}
+
+define void @dead_masked_store_alltrue_bigger(<vscale x 4 x i16> %val, <vscale x 4 x i32> %val1, ptr %a, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: dead_masked_store_alltrue_bigger:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %alltrue.ins = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %alltrue = shufflevector <vscale x 4 x i1> %alltrue.ins, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %val, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val1, ptr %a, i32 4, <vscale x 4 x i1> %alltrue)
+  ret void
+}
+
+define void @dead_masked_store_alltrue_smaller(<vscale x 4 x i32> %val, <vscale x 4 x i16> %val1, ptr %a, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: dead_masked_store_alltrue_smaller:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1h { z1.s }, p1, [x0]
+; CHECK-NEXT:    ret
+  %alltrue.ins = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+  %alltrue = shufflevector <vscale x 4 x i1> %alltrue.ins, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %val1, ptr %a, i32 4, <vscale x 4 x i1> %alltrue)
+  ret void
+}
+
+define void @dead_masked_store_same_mask_smaller_type(<vscale x 4 x i32> %val, <vscale x 4 x i16> %val1, ptr %a, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: dead_masked_store_same_mask_smaller_type:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1h { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %val1, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @dead_masked_store_same_mask_bigger_type(<vscale x 4 x i16> %val, <vscale x 4 x i32> %val1, ptr %a, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: dead_masked_store_same_mask_bigger_type:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %val, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val1, ptr %a, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, ptr, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -5564,7 +5564,6 @@
 ;
 ; AVX1OR2-LABEL: PR11210:
 ; AVX1OR2:       ## %bb.0:
-; AVX1OR2-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
 ; AVX1OR2-NEXT:    vmaskmovps %xmm1, %xmm2, (%rdi)
 ; AVX1OR2-NEXT:    retq
 ;
@@ -5572,12 +5571,10 @@
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm3, %k0
+; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vmovups %zmm0, (%rdi) {%k1}
 ; AVX512F-NEXT:    vmovups %zmm1, (%rdi) {%k1}
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -5585,15 +5582,13 @@
 ; AVX512VLDQ-LABEL: PR11210:
 ; AVX512VLDQ:       ## %bb.0:
 ; AVX512VLDQ-NEXT:    vpmovd2m %xmm2, %k1
-; AVX512VLDQ-NEXT:    vmovups %xmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT:    vmovups %xmm1, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: PR11210:
 ; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512VLBW-NEXT:    vpcmpgtd %xmm2, %xmm3, %k1
-; AVX512VLBW-NEXT:    vmovups %xmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpcmpgtd %xmm2, %xmm0, %k1
 ; AVX512VLBW-NEXT:    vmovups %xmm1, (%rdi) {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -5601,7 +5596,6 @@
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovd2m %xmm2, %k1
-; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vmovups %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %bc = bitcast <2 x i64> %mask to <4 x i32>