diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -51,6 +51,11 @@
 
 #define DEBUG_TYPE "scalarizer"
 
+static cl::opt<bool> ScalarizeVariableInsertExtract(
+    "scalarize-variable-insert-extract", cl::init(true), cl::Hidden,
+    cl::desc("Allow the scalarizer pass to scalarize "
+             "insertelement/extractelement with variable index"));
+
 // This is disabled by default because having separate loads and stores
 // makes it more likely that the -combiner-alias-analysis limits will be
 // reached.
@@ -192,6 +197,7 @@
   bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
   bool visitCastInst(CastInst &CI);
   bool visitBitCastInst(BitCastInst &BCI);
+  bool visitInsertElementInst(InsertElementInst &IEI);
   bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
   bool visitPHINode(PHINode &PHI);
   bool visitLoadInst(LoadInst &LI);
@@ -740,6 +746,53 @@
   return true;
 }
 
+/// Scalarize an insertelement whose index is not a ConstantInt.
+///
+/// Each result lane I becomes `select (InsIdx == I), NewElt, Op0[I]`, so the
+/// inserted value lands in the lane named by the run-time index and every
+/// other lane keeps the matching element of the source vector.
+///
+/// Returns false without changing anything when the
+/// -scalarize-variable-insert-extract option is off, when the result is not a
+/// vector type, or when the index is a ConstantInt.
+bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
+  if (!ScalarizeVariableInsertExtract)
+    return false;
+
+  VectorType *VT = dyn_cast<VectorType>(IEI.getType());
+  if (!VT)
+    return false;
+
+  // Reject constant indices before building anything: the builder and the
+  // scattered operand are only needed on the variable-index path.
+  Value *InsIdx = IEI.getOperand(2);
+  if (isa<ConstantInt>(InsIdx))
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(&IEI);
+  Scatterer Op0 = scatter(&IEI, IEI.getOperand(0));
+  Value *NewElt = IEI.getOperand(1);
+
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  for (unsigned I = 0; I < NumElems; ++I) {
+    // Build the compare, the old element and the select as separate
+    // statements. Nesting them as call arguments would leave their relative
+    // evaluation order (and thus the order of emitted instructions, which the
+    // tests check) up to the compiler.
+    Value *ShouldReplace =
+        Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I),
+                             InsIdx->getName() + ".is." + Twine(I));
+    Value *OldElt = Op0[I];
+    Res[I] = Builder.CreateSelect(ShouldReplace, NewElt, OldElt,
+                                  IEI.getName() + ".i" + Twine(I));
+  }
+  gather(&IEI, Res);
+  return true;
+}
+
 bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   VectorType *VT = dyn_cast<VectorType>(SVI.getType());
   if (!VT)
diff --git a/llvm/test/Transforms/Scalarizer/basic.ll b/llvm/test/Transforms/Scalarizer/basic.ll
--- a/llvm/test/Transforms/Scalarizer/basic.ll
+++ b/llvm/test/Transforms/Scalarizer/basic.ll
@@ -363,19 +363,37 @@
   ret void
 }
 
-; Test that variable inserts aren't scalarized.
+; Test that variable inserts are scalarized by default
 define void @f12(<4 x i32> *%dest, <4 x i32> *%src, i32 %index) {
-; CHECK: @f12(
-; CHECK: %val1 = insertelement <4 x i32> %val0, i32 1, i32 %index
-; CHECK-DAG: %val1.i0 = extractelement <4 x i32> %val1, i32 0
-; CHECK-DAG: %val1.i1 = extractelement <4 x i32> %val1, i32 1
-; CHECK-DAG: %val1.i2 = extractelement <4 x i32> %val1, i32 2
-; CHECK-DAG: %val1.i3 = extractelement <4 x i32> %val1, i32 3
-; CHECK-DAG: %val2.i0 = shl i32 1, %val1.i0
-; CHECK-DAG: %val2.i1 = shl i32 2, %val1.i1
-; CHECK-DAG: %val2.i2 = shl i32 3, %val1.i2
-; CHECK-DAG: %val2.i3 = shl i32 4, %val1.i3
-; CHECK: ret void
+; CHECK-LABEL: @f12(
+; CHECK: %dest.i0 = bitcast <4 x i32>* %dest to i32*
+; CHECK: %dest.i1 = getelementptr i32, i32* %dest.i0, i32 1
+; CHECK: %dest.i2 = getelementptr i32, i32* %dest.i0, i32 2
+; CHECK: %dest.i3 = getelementptr i32, i32* %dest.i0, i32 3
+; CHECK: %src.i0 = bitcast <4 x i32>* %src to i32*
+; CHECK: %val0.i0 = load i32, i32* %src.i0, align 16
+; CHECK: %src.i1 = getelementptr i32, i32* %src.i0, i32 1
+; CHECK: %val0.i1 = load i32, i32* %src.i1, align 4
+; CHECK: %src.i2 = getelementptr i32, i32* %src.i0, i32 2
+; CHECK: %val0.i2 = load i32, i32* %src.i2, align 8
+; CHECK: %src.i3 = getelementptr i32, i32* %src.i0, i32 3
+; CHECK: %val0.i3 = load i32, i32* %src.i3, align 4
+; CHECK: %index.is.0 = icmp eq i32 %index, 0
+; CHECK: %val1.i0 = select i1 %index.is.0, i32 1, i32 %val0.i0
+; CHECK: %index.is.1 = icmp eq i32 %index, 1
+; CHECK: %val1.i1 = select i1 %index.is.1, i32 1, i32 %val0.i1
+; CHECK: %index.is.2 = icmp eq i32 %index, 2
+; CHECK: %val1.i2 = select i1 %index.is.2, i32 1, i32 %val0.i2
+; CHECK: %index.is.3 = icmp eq i32 %index, 3
+; CHECK: %val1.i3 = select i1 %index.is.3, i32 1, i32 %val0.i3
+; CHECK: %val2.i0 = shl i32 1, %val1.i0
+; CHECK: %val2.i1 = shl i32 2, %val1.i1
+; CHECK: %val2.i2 = shl i32 3, %val1.i2
+; CHECK: %val2.i3 = shl i32 4, %val1.i3
+; CHECK: store i32 %val2.i0, i32* %dest.i0, align 16
+; CHECK: store i32 %val2.i1, i32* %dest.i1, align 4
+; CHECK: store i32 %val2.i2, i32* %dest.i2, align 8
+; CHECK: store i32 %val2.i3, i32* %dest.i3, align 4
   %val0 = load <4 x i32> , <4 x i32> *%src
   %val1 = insertelement <4 x i32> %val0, i32 1, i32 %index
   %val2 = shl <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %val1
diff --git a/llvm/test/Transforms/Scalarizer/variable-insertelement.ll b/llvm/test/Transforms/Scalarizer/variable-insertelement.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Scalarizer/variable-insertelement.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -scalarizer -dce -S | FileCheck --check-prefixes=ALL,DEFAULT %s
+; RUN: opt %s -scalarizer -scalarize-variable-insert-extract=false -dce -S | FileCheck --check-prefixes=ALL,OFF %s
+; RUN: opt %s -scalarizer -scalarize-variable-insert-extract=true -dce -S | FileCheck --check-prefixes=ALL,DEFAULT,ON %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Test that variable inserts are scalarized.
+define <4 x i32> @f1(<4 x i32> %src, i32 %val, i32 %index) {
+; DEFAULT-LABEL: @f1(
+; DEFAULT-NEXT:    [[INDEX_IS_0:%.*]] = icmp eq i32 [[INDEX:%.*]], 0
+; DEFAULT-NEXT:    [[SRC_I0:%.*]] = extractelement <4 x i32> [[SRC:%.*]], i32 0
+; DEFAULT-NEXT:    [[RES_I0:%.*]] = select i1 [[INDEX_IS_0]], i32 [[VAL:%.*]], i32 [[SRC_I0]]
+; DEFAULT-NEXT:    [[INDEX_IS_1:%.*]] = icmp eq i32 [[INDEX]], 1
+; DEFAULT-NEXT:    [[SRC_I1:%.*]] = extractelement <4 x i32> [[SRC]], i32 1
+; DEFAULT-NEXT:    [[RES_I1:%.*]] = select i1 [[INDEX_IS_1]], i32 [[VAL]], i32 [[SRC_I1]]
+; DEFAULT-NEXT:    [[INDEX_IS_2:%.*]] = icmp eq i32 [[INDEX]], 2
+; DEFAULT-NEXT:    [[SRC_I2:%.*]] = extractelement <4 x i32> [[SRC]], i32 2
+; DEFAULT-NEXT:    [[RES_I2:%.*]] = select i1 [[INDEX_IS_2]], i32 [[VAL]], i32 [[SRC_I2]]
+; DEFAULT-NEXT:    [[INDEX_IS_3:%.*]] = icmp eq i32 [[INDEX]], 3
+; DEFAULT-NEXT:    [[SRC_I3:%.*]] = extractelement <4 x i32> [[SRC]], i32 3
+; DEFAULT-NEXT:    [[RES_I3:%.*]] = select i1 [[INDEX_IS_3]], i32 [[VAL]], i32 [[SRC_I3]]
+; DEFAULT-NEXT:    [[RES_UPTO0:%.*]] = insertelement <4 x i32> undef, i32 [[RES_I0]], i32 0
+; DEFAULT-NEXT:    [[RES_UPTO1:%.*]] = insertelement <4 x i32> [[RES_UPTO0]], i32 [[RES_I1]], i32 1
+; DEFAULT-NEXT:    [[RES_UPTO2:%.*]] = insertelement <4 x i32> [[RES_UPTO1]], i32 [[RES_I2]], i32 2
+; DEFAULT-NEXT:    [[RES:%.*]] = insertelement <4 x i32> [[RES_UPTO2]], i32 [[RES_I3]], i32 3
+; DEFAULT-NEXT:    ret <4 x i32> [[RES]]
+;
+; OFF-LABEL: @f1(
+; OFF-NEXT:    [[RES:%.*]] = insertelement <4 x i32> [[SRC:%.*]], i32 [[VAL:%.*]], i32 [[INDEX:%.*]]
+; OFF-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = insertelement <4 x i32> %src, i32 %val, i32 %index
+  ret <4 x i32> %res
+}