Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10042,9 +10042,12 @@ } // In addition to the cases handle just above, we need to prevent - // allocas from moving below a stacksave. The stackrestore case - // is currently thought to be conservatism. - if (isa(BundleMember->Inst)) { + // allocas and loads/stores from moving below a stacksave or a + // stackrestore. Avoiding moving allocas below stackrestore is currently + // thought to be conservatism. Moving loads/stores below a stackrestore + // can lead to incorrect code. + if (isa(BundleMember->Inst) || + BundleMember->Inst->mayReadOrWriteMemory()) { for (Instruction *I = BundleMember->Inst->getNextNode(); I != ScheduleEnd; I = I->getNextNode()) { if (!match(I, m_Intrinsic()) && Index: llvm/test/Transforms/SLPVectorizer/X86/stackrestore-dependence.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/X86/stackrestore-dependence.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -slp-threshold=-999 -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -S < %s | FileCheck %s + +; The test checks that loads should not be moved below a stackrestore boundary. +define void @stackrestore1(ptr %out) { +; CHECK-LABEL: @stackrestore1( +; CHECK-NEXT: [[STACK:%.*]] = call ptr @llvm.stacksave() +; CHECK-NEXT: [[LOCAL_ALLOCA:%.*]] = alloca [16 x i8], align 4 +; CHECK-NEXT: store <4 x float> , ptr [[LOCAL_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[LOCAL_ALLOCA]], align 4 +; CHECK-NEXT: call void @llvm.stackrestore(ptr [[STACK]]) +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: store <4 x float> [[SHUFFLE]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %stack = call ptr @llvm.stacksave() + %local_alloca = alloca [16 x i8], align 4 + store float 0x3FF3333340000000, ptr %local_alloca, align 4 + %addr1 = getelementptr inbounds i8, ptr %local_alloca, i64 4 + store float 0x3FF3333340000000, ptr %addr1, align 4 + %addr2 = getelementptr inbounds i8, ptr %local_alloca, i64 8 + store float 0x3FF3333340000000, ptr %addr2, align 4 + %addr3 = getelementptr inbounds i8, ptr %local_alloca, i64 12 + store float 0x3FF3333340000000, ptr %addr3, align 4 + %val0 = load float, ptr %local_alloca, align 4 + %val1 = load float, ptr %addr1, align 4 + %val2 = load float, ptr %addr2, align 4 + %val3 = load float, ptr %addr3, align 4 + call void @llvm.stackrestore(i8* %stack) + %outaddr2 = getelementptr inbounds float, ptr %out, i64 2 + store float %val0, ptr %outaddr2, align 4 + %outaddr3 = getelementptr inbounds float, ptr %out, i64 3 + store float %val1, ptr %outaddr3, align 4 + store float %val2, ptr %out + %outaddr1 = getelementptr inbounds float, ptr %out, i64 1 + store float %val3, ptr %outaddr1, align 4 + ret void +} + +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*)