Index: llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp =================================================================== --- llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp +++ llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp @@ -123,17 +123,20 @@ // T=VLDRH.16; A=VMOVNB T; B=VMOVNT T // But those VMOVL may be folded into a VMULL. - // But expensive extends/truncs are always good to remove. - for (auto *E : Exts) - if (!isa(E->getOperand(0))) { + // But expensive extends/truncs are always good to remove. FPExts always + // involve extra VCVT's so are always considered to be beneficial to convert. + for (auto *E : Exts) { + if (isa(E) || !isa(E->getOperand(0))) { LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n"); return true; } - for (auto *T : Truncs) + } + for (auto *T : Truncs) { if (T->hasOneUse() && !isa(*T->user_begin())) { LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n"); return true; } + } // Otherwise, we know we have a load(ext), see if any of the Extends are a // vmull. This is a simple heuristic and certainly not perfect. @@ -172,6 +175,7 @@ switch (I->getOpcode()) { // Truncs case Instruction::Trunc: + case Instruction::FPTrunc: if (Truncs.count(I)) continue; Truncs.insert(I); @@ -181,6 +185,7 @@ // Extend leafs case Instruction::SExt: case Instruction::ZExt: + case Instruction::FPExt: if (Exts.count(I)) continue; for (auto *Use : I->users()) @@ -196,6 +201,9 @@ case Instruction::LShr: case Instruction::Shl: case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::FAdd: + case Instruction::FMul: case Instruction::Select: if (Ops.count(I)) continue; @@ -297,9 +305,11 @@ LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n"); Builder.SetInsertPoint(I); Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask); + bool FPext = isa(I); bool Sext = isa(I); - Value *Ext = Sext ? Builder.CreateSExt(Shuffle, I->getType()) - : Builder.CreateZExt(Shuffle, I->getType()); + Value *Ext = FPext ? 
Builder.CreateFPExt(Shuffle, I->getType()) + : Sext ? Builder.CreateSExt(Shuffle, I->getType()) + : Builder.CreateZExt(Shuffle, I->getType()); I->replaceAllUsesWith(Ext); LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n"); } Index: llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll +++ llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll @@ -360,16 +360,14 @@ ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q1, [r0], #16 -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vldrh.u16 q1, [r0], #16 +; CHECK-NEXT: vcvtb.f32.f16 q2, q1 +; CHECK-NEXT: vcvtt.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q2, q2, q0 ; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1], #16 +; CHECK-NEXT: vcvtb.f16.f32 q2, q2 +; CHECK-NEXT: vcvtt.f16.f32 q2, q1 +; CHECK-NEXT: vstrb.8 q2, [r1], #16 ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -412,26 +410,22 @@ ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0, #24] -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #24] -; CHECK-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #16] -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vldrh.u16 q1, [r0, #16] +; CHECK-NEXT: vcvtb.f32.f16 q2, q1 +; CHECK-NEXT: 
vcvtt.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q2, q2, q0 ; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q1, [r0], #32 -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 +; CHECK-NEXT: vcvtb.f16.f32 q2, q2 +; CHECK-NEXT: vcvtt.f16.f32 q2, q1 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vstrh.16 q2, [r1, #16] +; CHECK-NEXT: vcvtb.f32.f16 q2, q1 +; CHECK-NEXT: vcvtt.f32.f16 q1, q1 +; CHECK-NEXT: vmul.f32 q2, q2, q0 ; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1], #32 +; CHECK-NEXT: vcvtb.f16.f32 q2, q2 +; CHECK-NEXT: vcvtt.f16.f32 q2, q1 +; CHECK-NEXT: vstrh.16 q2, [r1], #32 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc}