diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -76,6 +76,8 @@
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+  bool shouldSinkOperands(Instruction *I,
+                          SmallVectorImpl<Use *> &Ops) const override;
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
@@ -833,6 +834,38 @@
   return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
 }
 
+/// Report which operands of \p I, if any, should be sunk next to it.
+///
+/// Only vector shifts are handled. When the shift amount (operand 1) is a
+/// non-constant splat written as
+///   shufflevector (insertelement _, x, 0), _, zeroinitializer
+/// the use of the insertelement and the use of the shufflevector are appended
+/// to \p Ops (insert first, then shuffle) and true is returned. In every other
+/// case \p Ops is left untouched and false is returned.
+bool WebAssemblyTargetLowering::shouldSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+
+  if (!I->getType()->isVectorTy() || !I->isShift())
+    return false;
+
+  Value *V = I->getOperand(1);
+  // We don't need to sink a constant splat.
+  if (isa<Constant>(V))
+    return false;
+
+  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+                         m_Value(), m_ZeroMask()))) {
+    // Sink insert
+    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
+    // Sink shuffle
+    Ops.push_back(&I->getOperandUse(1));
+    return true;
+  }
+
+  return false;
+}
+
 EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
                                                   LLVMContext &C,
                                                   EVT VT) const {
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test that SIMD shifts can be lowered correctly even when shift
+; values are exported from outside blocks.
+
+target triple = "wasm32-unknown-unknown"
+
+define void @shl_loop(ptr %a, i8 %shift, i32 %count) {
+; CHECK-LABEL: shl_loop:
+; CHECK:         .functype shl_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB0_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label0
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+  %t1 = insertelement <16 x i8> undef, i8 %shift, i32 0
+  %vshift = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
+  br label %body
+body:
+  %out = phi ptr [%a, %entry], [%b, %body]
+  %i = phi i32 [0, %entry], [%next, %body]
+  %v = load <16 x i8>, ptr %out, align 1
+  %r = shl <16 x i8> %v, %vshift
+  %b = getelementptr inbounds i8, ptr %out, i32 16
+  store <16 x i8> %r, ptr %b
+  %next = add i32 %i, 1
+  %i.cmp = icmp eq i32 %next, %count
+  br i1 %i.cmp, label %body, label %exit
+exit:
+  ret void
+}
+
+; Test that SIMD shifts can be lowered correctly when shift value
+; is a phi inside loop body.
+
+define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) {
+; CHECK-LABEL: shl_phi_loop:
+; CHECK:         .functype shl_phi_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB1_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label1:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+  br label %body
+body:
+  %out = phi ptr [%a, %entry], [%b, %body]
+  %i = phi i32 [0, %entry], [%next, %body]
+  %t1 = phi i8 [%shift, %entry], [%sand, %body]
+  %t2 = insertelement <16 x i8> undef, i8 %t1, i32 0
+  %vshift = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %v = load <16 x i8>, ptr %out, align 1
+  %r = shl <16 x i8> %v, %vshift
+  %b = getelementptr inbounds i8, ptr %out, i32 16
+  store <16 x i8> %r, ptr %b
+  %sand = and i8 %t1, 1
+  %next = add i32 %i, 1
+  %i.cmp = icmp eq i32 %next, %count
+  br i1 %i.cmp, label %body, label %exit
+exit:
+  ret void
+}