[ARM] Enable mixed types in ARM CGP

Previously, during the search, all values had to have the same 'TypeSize', which is equal to number of bits of the integer type of the icmp operand. All values in the tree had to match this size; meaning that, if we searched from i16, we wouldn't accept i8s. A change in type size requires zext and truncs to perform the casts so, to allow mixed narrow types, the handling of these instructions is now slightly different: - we allow casts if their result or operand is <= TypeSize. - zexts are sinks if their result > TypeSize. - truncs are still sinks if their operand == TypeSize. - truncs are still sources if their result == TypeSize. The transformation bails on finding an icmp that operates on data smaller than the current TypeSize. Differential Revision: https://reviews.llvm.org/D54108 llvm-svn: 346480
llvm · Nov 9, 2018 · 08979cd · 08979cd
1 parent 453ba91
commit 08979cd
Showing 4 changed files with 298 additions and 70 deletions.
diff --git a/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -126,7 +126,7 @@ class IRPromoter {
                    SmallPtrSetImpl<Instruction*> &SafeToPromote);
   void TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
                      SmallPtrSetImpl<Instruction*> &Sinks);
-  void Cleanup(SmallPtrSetImpl<Instruction*> &Sinks);
+  void Cleanup(SmallPtrSetImpl<Value*> &Visited);
 
 public:
   IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
@@ -180,6 +180,18 @@ static bool generateSignBits(Value *V) {
          Opc == Instruction::SRem;
 }
 
+static bool EqualTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
+}
+
+static bool LessOrEqualTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize;
+}
+
+static bool GreaterThanTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize;
+}
+
 /// Some instructions can use 8- and 16-bit operands, and we don't need to
 /// promote anything larger. We disallow booleans to make life easier when
 /// dealing with icmps but allow any other integer that is <= 16 bits. Void
@@ -194,11 +206,10 @@ static bool isSupportedType(Value *V) {
   if (auto *Ld = dyn_cast<LoadInst>(V))
     Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
 
-  const IntegerType *IntTy = dyn_cast<IntegerType>(Ty);
-  if (!IntTy)
+  if (!isa<IntegerType>(Ty))
     return false;
 
-  return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize;
+  return LessOrEqualTypeSize(V);
 }
 
 /// Return true if the given value is a source in the use-def chain, producing
@@ -221,7 +232,7 @@ static bool isSource(Value *V) {
   else if (auto *Call = dyn_cast<CallInst>(V))
     return Call->hasRetAttr(Attribute::AttrKind::ZExt);
   else if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return isSupportedType(Trunc);
+    return EqualTypeSize(Trunc);
   return false;
 }
 
@@ -232,18 +243,15 @@ static bool isSink(Value *V) {
   // TODO The truncate also isn't actually necessary because we would already
   // proved that the data value is kept within the range of the original data
   // type.
-  auto UsesNarrowValue = [](Value *V) {
-    return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
-  };
 
   if (auto *Store = dyn_cast<StoreInst>(V))
-    return UsesNarrowValue(Store->getValueOperand());
+    return LessOrEqualTypeSize(Store->getValueOperand());
   if (auto *Return = dyn_cast<ReturnInst>(V))
-    return UsesNarrowValue(Return->getReturnValue());
+    return LessOrEqualTypeSize(Return->getReturnValue());
   if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return UsesNarrowValue(Trunc->getOperand(0));
+    return EqualTypeSize(Trunc->getOperand(0));
   if (auto *ZExt = dyn_cast<ZExtInst>(V))
-    return UsesNarrowValue(ZExt->getOperand(0));
+    return GreaterThanTypeSize(ZExt);
   if (auto *ICmp = dyn_cast<ICmpInst>(V))
     return ICmp->isSigned();
 
@@ -649,36 +657,37 @@ void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
       }
     }
   }
-
 }
 
-void IRPromoter::Cleanup(SmallPtrSetImpl<Instruction*> &Sinks) {
-  // Some zext sinks will now have become redundant, along with their trunc
-  // operands, so remove them.
-  for (auto I : Sinks) {
-    if (auto *ZExt = dyn_cast<ZExtInst>(I)) {
-      if (ZExt->getDestTy() != ExtTy)
-        continue;
+void IRPromoter::Cleanup(SmallPtrSetImpl<Value*> &Visited) {
+  // Some zexts will now have become redundant, along with their trunc
+  // operands, so remove them
+  for (auto V : Visited) {
+    if (!isa<ZExtInst>(V))
+      continue;
 
-      Value *Src = ZExt->getOperand(0);
-      if (ZExt->getSrcTy() == ZExt->getDestTy()) {
-        LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary zext\n");
-        ReplaceAllUsersOfWith(ZExt, Src);
-        InstsToRemove.push_back(ZExt);
-        continue;
-      }
+    auto ZExt = cast<ZExtInst>(V);
+    if (ZExt->getDestTy() != ExtTy)
+      continue;
 
-      // For any truncs that we insert to handle zexts, we can replace the
-      // result of the zext with the input to the trunc.
-      if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
-        auto *Trunc = cast<TruncInst>(Src);
-        assert(Trunc->getOperand(0)->getType() == ExtTy &&
-               "expected inserted trunc to be operating on i32");
-        LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: "
-                   << *Trunc->getOperand(0));
-        ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
-        InstsToRemove.push_back(ZExt);
-      }
+    Value *Src = ZExt->getOperand(0);
+    if (ZExt->getSrcTy() == ZExt->getDestTy()) {
+      LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast.\n");
+      ReplaceAllUsersOfWith(ZExt, Src);
+      InstsToRemove.push_back(ZExt);
+      continue;
+    }
+
+    // For any truncs that we insert to handle zexts, we can replace the
+    // result of the zext with the input to the trunc.
+    if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
+      auto *Trunc = cast<TruncInst>(Src);
+      assert(Trunc->getOperand(0)->getType() == ExtTy &&
+             "expected inserted trunc to be operating on i32");
+      LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: "
+                 << *Trunc->getOperand(0));
+      ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
+      InstsToRemove.push_back(ZExt);
     }
   }
 
@@ -728,27 +737,25 @@ void IRPromoter::Mutate(Type *OrigTy,
 
   // Finally, remove unecessary zexts and truncs, delete old instructions and
   // clear the data structures.
-  Cleanup(Sinks);
+  Cleanup(Visited);
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
-  LLVM_DEBUG(dbgs();
-             for (auto *V : Sources)
-               V->dump();
-             for (auto *I : NewInsts)
-               I->dump();
-             for (auto *V : Visited) {
-               if (!Sources.count(V))
-                 V->dump();
-              });
+  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n");
 }
 
 /// We accept most instructions, as well as Arguments and ConstantInsts. We
 /// Disallow casts other than zext and truncs and only allow calls if their
 /// return value is zeroext. We don't allow opcodes that can introduce sign
 /// bits.
 bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
-  if (isa<ICmpInst>(V))
-    return true;
+  if (auto *I = dyn_cast<ICmpInst>(V)) {
+    // Now that we allow small types than TypeSize, only allow icmp of
+    // TypeSize because they will require a trunc to be legalised.
+    // TODO: Allow icmp of smaller types, and calculate at the end
+    // whether the transform would be beneficial.
+    if (isa<PointerType>(I->getOperand(0)->getType()))
+      return true;
+    return EqualTypeSize(I->getOperand(0));
+  }
 
   // Memory instructions
   if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
@@ -766,12 +773,11 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
       isa<LoadInst>(V))
     return isSupportedType(V);
 
-  // Truncs can be either sources or sinks.
-  if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return isSupportedType(Trunc) || isSupportedType(Trunc->getOperand(0));
+  if (isa<SExtInst>(V))
+    return false;
 
-  if (isa<CastInst>(V) && !isa<SExtInst>(V))
-    return isSupportedType(cast<CastInst>(V)->getOperand(0));
+  if (auto *Cast = dyn_cast<CastInst>(V))
+    return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
 
   // Special cases for calls as we need to check for zeroext
   // TODO We should accept calls even if they don't have zeroext, as they can
@@ -901,13 +907,17 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
     // Calls can be both sources and sinks.
     if (isSink(V))
       Sinks.insert(cast<Instruction>(V));
+
     if (isSource(V))
       Sources.insert(V);
-    else if (auto *I = dyn_cast<Instruction>(V)) {
-      // Visit operands of any instruction visited.
-      for (auto &U : I->operands()) {
-        if (!AddLegalInst(U))
-          return false;
+
+    if (!isSink(V) && !isSource(V)) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        // Visit operands of any instruction visited.
+        for (auto &U : I->operands()) {
+          if (!AddLegalInst(U))
+            return false;
+        }
       }
     }
 
@@ -973,6 +983,8 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) {
         if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
           continue;
 
+        LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
+
         for (auto &Op : CI.operands()) {
           if (auto *I = dyn_cast<Instruction>(Op))
             MadeChange |= TryToPromote(I);

diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-calls.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
@@ -22,12 +22,10 @@ define i16 @test_call(i8 zeroext %arg) {
   ret i16 %conv
 }
 
-; Test that the transformation bails when it finds that i16 is larger than i8.
-; TODO: We should be able to remove the uxtb in these cases.
 ; CHECK-LABEL: promote_i8_sink_i16_1
 ; CHECK: bl dummy_i8
 ; CHECK: add{{.*}} r0, #1
-; CHECK: uxtb r0, r0
+; CHECK-NOT: uxt
 ; CHECK: cmp r0
 define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) {
   %call = tail call zeroext i8 @dummy_i8(i8 %arg0)

diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-casts.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
@@ -340,8 +340,8 @@ declare i32 @dummy(i32, i32)
 @sh1 = hidden local_unnamed_addr global i16 0, align 2
 @d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
 
-; CHECK-LABEL: two_stage_zext_trunc_mix
-; CHECK-NOT: uxt
+; CHECK-COMMON-LABEL: two_stage_zext_trunc_mix
+; CHECK-COMMON-NOT: uxt
 define i8* @two_stage_zext_trunc_mix(i32* %this, i32 %__pos1, i32 %__n1, i32** %__str, i32 %__pos2, i32 %__n2) {
 entry:
   %__size_.i.i.i.i = bitcast i32** %__str to i8*
@@ -363,3 +363,222 @@ entry:
   %res = select i1 %tobool.i.i.i.i.i,  i8* %7, i8* %8
   ret i8* %res
 }
+
+; CHECK-COMMON-LABEL: search_through_zext_1
+; CHECK-COMMON-NOT: uxt
+define i8 @search_through_zext_1(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i16
+  %cmp2 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxtb here. The transform fails because
+; the icmp ugt uses an i32, which is too large... but this doesn't matter
+; because it won't be writing a large value to a register as a result.
+; CHECK-COMMON-LABEL: search_through_zext_2
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define i8 @search_through_zext_2(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i32
+  %cmp2 = icmp ugt i32 %conv2, %d
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxtb here as all the calculations are
+; performed on i8s. The promotion of i8 to i16 and then the later truncation
+; results in the uxtb.
+; CHECK-COMMON-LABEL: search_through_zext_3
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define i8 @search_through_zext_3(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %conv to i8
+  %sub = sub nuw i8 %b, %trunc
+  %conv2 = zext i8 %sub to i32
+  %cmp2 = icmp ugt i32 %conv2, %d
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxt that gets introduced for %conv2
+; CHECK-COMMON-LABEL: search_through_zext_cmp
+; CHECK-COMMON: uxt
+define i8 @search_through_zext_cmp(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %cmp = icmp ne i8 %a, %b
+  %conv = zext i1 %cmp to i16
+  %cmp1 = icmp ult i16 %conv, %c
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i16
+  %cmp3 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp3, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: search_through_zext_load
+; CHECK-COMMON-NOT: uxt
+define i8 @search_through_zext_load(i8* %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %load = load i8, i8* %a
+  %conv = zext i8 %load to i16
+  %cmp1 = icmp ult i16 %conv, %c
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %load
+  %conv2 = zext i8 %sub to i16
+  %cmp3 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp3, i8 %load, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than
+; CHECK-COMMON-NOT: uxth
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define i16 @trunc_sink_less_than_cmp(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, 1
+  %cmp2 = icmp ugt i8 %trunc, %add
+  %res = select i1 %cmp2, i16 %a, i16 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
+  ret i16 %retval
+}
+
+; TODO: We should be able to remove the uxth introduced to handle %sub
+; CHECK-COMMON-LABEL: trunc_sink_less_than_arith
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define i16 @trunc_sink_less_than_arith(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  %cmp2 = icmp ugt i8 %e, %add
+  %res = select i1 %cmp2, i16 %a, i16 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
+  ret i16 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_store
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp
+; CHECK-COMMON-NOT: uxt
+define i16 @trunc_sink_less_than_store(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8* %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  store i8 %add, i8* %e
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %sub, %if.then ]
+  ret i16 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_ret
+; CHECK-COMMON-NOT: uxt
+define i8 @trunc_sink_less_than_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_zext_ret
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define zeroext i8 @trunc_sink_less_than_zext_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
+  ret i8 %retval
+}
diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
@@ -11,18 +11,17 @@
 ; CHECK-NODSP: sxtb
 ; CHECK-NODSP: cmp
 
-; CHECK-DSP: add
-; CHECK-DSP: uxtb
+; CHECK-DSP: uadd8
+; CHECK-DSP: sub
 ; CHECK-DSP: cmp
 ; CHECK-DSP: sxtb
-; CHECK-DSP: sub
 ; CHECK-DSP: sxtb
 ; CHECK-DSP: cmp
 
 ; CHECK-DSP-IMM: uadd8 [[ADD:r[0-9]+]],
 ; CHECK-DSP-IMM: cmp [[ADD]],
+; CHECK-DSP-IMM: subs [[SUB:r[0-9]+]],
 ; CHECK-DSP-IMM: sxtb [[SEXT0:r[0-9]+]], [[ADD]]
-; CHECK-DSP-IMM: usub8 [[SUB:r[0-9]+]],
 ; CHECK-DSP-IMM: sxtb [[SEXT1:r[0-9]+]], [[SUB]]
 ; CHECK-DSP-IMM: cmp [[SEXT1]], [[SEXT0]]
 define i8 @eq_sgt(i8* %x, i8 *%y, i8 zeroext %z) {