diff --git a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
--- a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -163,14 +163,26 @@
   // The result vector
   Value *VResult = Src0;
 
+  // Shorten the way if the mask is a vector of constants.
+  // Create a build_vector pattern, with loads/undefs as necessary and then
+  // shuffle blend with the pass through value.
   if (isConstantIntVector(Mask)) {
+    VResult = UndefValue::get(VecType);
+    SmallVector<int, 16> ShuffleMask(VectorWidth, UndefMaskElem);
     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-      if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
-        continue;
-      Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
-      LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Gep, AdjustedAlignVal);
-      VResult = Builder.CreateInsertElement(VResult, Load, Idx);
+      Value *InsertElt;
+      if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue()) {
+        InsertElt = UndefValue::get(EltTy);
+        ShuffleMask[Idx] = Idx + VectorWidth;
+      } else {
+        Value *Gep =
+            Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
+        InsertElt = Builder.CreateAlignedLoad(EltTy, Gep, AdjustedAlignVal);
+        ShuffleMask[Idx] = Idx;
+      }
+      VResult = Builder.CreateInsertElement(VResult, InsertElt, Idx);
     }
+    VResult = Builder.CreateShuffleVector(VResult, Src0, ShuffleMask);
     CI->replaceAllUsesWith(VResult);
     CI->eraseFromParent();
     return;
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6117,20 +6117,15 @@
 define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
 ; SSE2-LABEL: mload_constmask_v4f32:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSE2-NEXT:    movups (%rdi), %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: mload_constmask_v4f32:
 ; SSE42:       ## %bb.0:
-; SSE42-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE42-NEXT:    movups (%rdi), %xmm1
 ; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: mload_constmask_v4f32:
@@ -6213,20 +6208,20 @@
 define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
 ; SSE2-LABEL: mload_constmask_v4i32:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
+; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: mload_constmask_v4i32:
 ; SSE42:       ## %bb.0:
-; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
-; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
-; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm0
+; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm1
+; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm1
+; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: mload_constmask_v4i32:
@@ -6295,21 +6290,18 @@
 define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
 ; SSE2-LABEL: mload_constmask_v8f32:
 ; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: mload_constmask_v8f32:
 ; SSE42:       ## %bb.0:
-; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: mload_constmask_v8f32:
@@ -6348,9 +6340,11 @@
 define <8 x float> @mload_constmask_v8f32_zero(<8 x float>* %addr, <8 x float> %dst) {
 ; SSE2-LABEL: mload_constmask_v8f32_zero:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    retq
 ;
@@ -6437,12 +6431,10 @@
 define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
 ; SSE2-LABEL: mload_constmask_v8i32:
 ; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -6451,10 +6443,13 @@
 ;
 ; SSE42-LABEL: mload_constmask_v8i32:
 ; SSE42:       ## %bb.0:
-; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
-; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
-; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
-; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm1
+; SSE42-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm2
+; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm2
+; SSE42-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: mload_constmask_v8i32:
@@ -6498,8 +6493,10 @@
 ;
 ; SSE42-LABEL: mload_constmask_v4i64:
 ; SSE42:       ## %bb.0:
-; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm0
-; SSE42-NEXT:    pinsrq $1, 24(%rdi), %xmm1
+; SSE42-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE42-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: mload_constmask_v4i64:
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
@@ -44,7 +44,8 @@
 define <2 x i64> @scalarize_v2i64_zero_mask(<2 x i64>* %p, <2 x i64> %passthru) {
 ; CHECK-LABEL: @scalarize_v2i64_zero_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
-; CHECK-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> undef, <2 x i64> [[PASSTHRU:%.*]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -55,8 +56,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP3]], i64 1
-; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[PASSTHRU:%.*]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll
@@ -43,7 +43,8 @@
 define <2 x i64> @scalarize_v2i64_zero_mask(<2 x i64>* %p, <2 x i64> %passthru) {
 ; CHECK-LABEL: @scalarize_v2i64_zero_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
-; CHECK-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> undef, <2 x i64> [[PASSTHRU:%.*]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -54,8 +55,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP3]], i64 1
-; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[PASSTHRU:%.*]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret
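
For reference, a minimal sketch of the IR the new constant-mask path should produce for the <4 x float> case exercised by @mload_constmask_v4f32 above, i.e. a masked load with constant mask <i1 true, i1 false, i1 true, i1 true>. The function and value names (@expansion_sketch, %base, %ld0, ...) are illustrative only, not what the pass prints, and the align 4 on the scalar loads assumes the usual element-sized alignment:

; Lanes 0, 2 and 3 are loaded into a build vector (lane 1 stays undef), then a
; single shufflevector blends in the pass-through value: indices 0-3 select the
; freshly loaded lanes, indices 4-7 select lanes of %dst.
define <4 x float> @expansion_sketch(<4 x float>* %addr, <4 x float> %dst) {
  %base = bitcast <4 x float>* %addr to float*
  %ld0 = load float, float* %base, align 4
  %v0 = insertelement <4 x float> undef, float %ld0, i64 0
  %v1 = insertelement <4 x float> %v0, float undef, i64 1
  %gep2 = getelementptr inbounds float, float* %base, i32 2
  %ld2 = load float, float* %gep2, align 4
  %v2 = insertelement <4 x float> %v1, float %ld2, i64 2
  %gep3 = getelementptr inbounds float, float* %base, i32 3
  %ld3 = load float, float* %gep3, align 4
  %v3 = insertelement <4 x float> %v2, float %ld3, i64 3
  %res = shufflevector <4 x float> %v3, <4 x float> %dst, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %res
}

Emitting one shuffle of a loads-only build vector against the pass-through value, instead of inserting loaded elements directly into the pass-through, is what lets the X86 backend form a single wide movups plus one blend in the updated masked_load.ll checks above.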