diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -40,6 +40,7 @@
 #include "llvm/ADT/PriorityWorklist.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/CaptureTracking.h"
@@ -72,6 +73,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/PredIteratorCache.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -1333,8 +1335,24 @@
         return false;
     }
     return true;
-  } else
-    return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+  } else if (const auto *EEI = dyn_cast<ExtractElementInst>(&I)) {
+    const VectorType *VectorOperandType = EEI->getVectorOperandType();
+    // NOTE(review): the isa<> type arguments below are inferred -- confirm.
+    if (isa<FixedVectorType, ScalableVectorType>(VectorOperandType)) {
+      // For an ExtractElementInst on AArch64, we cannot simply use getUserCost
+      // because it currently assumes, optimistically, that extracting the
+      // element at index 0 is free. However, the actual cost depends on the
+      // user: when the element is used as an integer, an `fmov` is needed.
+      //
+      // FIXME: Update cost estimation in `AArch64TTIImpl::getVectorInstrCost`
+      // so extracting one scalar element as integer gives a non-zero cost.
+      const Triple TargetTriple(I.getModule()->getTargetTriple());
+      if (TargetTriple.isAArch64() && VectorOperandType->isIntOrIntVectorTy())
+        return false;
+    }
+  }
+
+  return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
            TargetTransformInfo::TCC_Free;
 }
 
diff --git a/llvm/test/Transforms/LICM/AArch64/extract-element.ll b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -licm -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
+
+source_filename = "func.cpp"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Test that LICM detects that `extractelement` is not a free operation and thereby
+; doesn't sink it into the exiting block.
+define i1 @func(ptr %0, i64 %1) {
+; CHECK-LABEL: @func(
+; CHECK-NEXT:    br label [[TMP3:%.*]]
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP27:%.*]], [[TMP26:%.*]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[TMP1:%.*]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP28:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <16 x i8> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = sext <16 x i1> [[TMP9]] to <16 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr <8 x i16> [[TMP11]], <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <8 x i16> [[TMP12]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP15]], -1
+; CHECK-NEXT:    br i1 [[TMP16]], label [[TMP26]], label [[TMP17:%.*]]
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i64 [ [[TMP15]], [[TMP6]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP18]], -1
+; CHECK-NEXT:    [[TMP22:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[TMP21]], i1 false)
+; CHECK-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 2
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[TMP23]], [[TMP19]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i64 [[TMP24]], [[TMP1]]
+; CHECK-NEXT:    br label [[TMP30:%.*]]
+; CHECK:       26:
+; CHECK-NEXT:    [[TMP27]] = add i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[TMP3]]
+; CHECK:       28:
+; CHECK-NEXT:    [[TMP29:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ]
+; CHECK-NEXT:    br label [[TMP30]]
+; CHECK:       30:
+; CHECK-NEXT:    [[TMP31:%.*]] = phi i1 [ [[TMP20]], [[TMP17]] ], [ [[TMP29]], [[TMP28]] ]
+; CHECK-NEXT:    [[TMP32:%.*]] = phi i1 [ [[TMP25]], [[TMP17]] ], [ undef, [[TMP28]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i1 [[TMP31]], true
+; CHECK-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i1 true, i1 [[TMP32]]
+; CHECK-NEXT:    ret i1 [[TMP34]]
+;
+  br label %3
+
+3:                                                ; preds = %26, %2
+  %4 = phi i64 [ 0, %2 ], [ %27, %26 ]
+  %5 = icmp ult i64 %4, %1
+  br i1 %5, label %6, label %28
+
+6:                                                ; preds = %3
+  %7 = getelementptr inbounds i8, ptr %0, i64 %4
+  %8 = load <16 x i8>, ptr %7, align 1
+  %9 = icmp eq <16 x i8> %8, zeroinitializer
+  %10 = sext <16 x i1> %9 to <16 x i8>
+  %11 = bitcast <16 x i8> %10 to <8 x i16>
+  %12 = lshr <8 x i16> %11, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4> ; NOTE(review): shift amount inferred -- confirm
+  %13 = trunc <8 x i16> %12 to <8 x i8>
+  %14 = bitcast <8 x i8> %13 to <1 x i64>
+  %15 = extractelement <1 x i64> %14, i64 0
+  %16 = icmp eq i64 %15, -1
+  br i1 %16, label %26, label %17
+
+17:                                               ; preds = %6
+  %18 = phi i64 [ %15, %6 ]
+  %19 = phi i64 [ %4, %6 ]
+  %20 = phi i1 [ %5, %6 ]
+  %21 = xor i64 %18, -1
+  %22 = tail call i64 @llvm.cttz.i64(i64 %21, i1 false)
+  %23 = lshr i64 %22, 2
+  %24 = add i64 %23, %19
+  %25 = icmp uge i64 %24, %1
+  br label %30
+
+26:                                               ; preds = %6
+  %27 = add i64 %4, 16
+  br label %3
+
+28:                                               ; preds = %3
+  %29 = phi i1 [ %5, %3 ]
+  br label %30
+
+30:                                               ; preds = %28, %17
+  %31 = phi i1 [ %20, %17 ], [ %29, %28 ]
+  %32 = phi i1 [ %25, %17 ], [ undef, %28 ]
+  %33 = xor i1 %31, true
+  %34 = select i1 %33, i1 true, i1 %32
+  ret i1 %34
+}
+
+declare i64 @llvm.cttz.i64(i64, i1 immarg)