diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -403,6 +403,7 @@
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
     bool optimizeSwitchInst(SwitchInst *SI);
     bool optimizeExtractElementInst(Instruction *Inst);
+    bool optimizeInsertValueInst(InsertValueInst *Inst);
     bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
     bool fixupDbgValue(Instruction *I);
     bool placeDbgValues(Function &F);
@@ -7417,6 +7418,48 @@
   return false;
 }
 
+/// Replace the scalar expansion of a complex floating-point multiply that
+/// feeds a {real, imag} insertvalue chain with a call to the
+/// llvm.complex.multiply intrinsic.
+bool CodeGenPrepare::optimizeInsertValueInst(InsertValueInst *Inst) {
+  // Match a chain inserting the real part at index 0 and the imaginary part
+  // at index 1 of the result aggregate.
+  Value *RealV, *ImgV;
+  if (!match(Inst, m_InsertValue<1>(m_InsertValue<0>(m_Value(), m_Value(RealV)),
+                                    m_Value(ImgV))))
+    return false;
+
+  // Real part: Op0R * Op1R - Op0I * Op1I.
+  Value *Op0R, *Op0I, *Op1R, *Op1I;
+  if (!match(RealV, m_FSub(m_FMul(m_Value(Op0R), m_Value(Op1R)),
+                           m_FMul(m_Value(Op0I), m_Value(Op1I)))))
+    return false;
+
+  // Imaginary part: Op0R * Op1I + Op1R * Op0I.
+  if (!match(ImgV, m_c_FAdd(m_c_FMul(m_Specific(Op0R), m_Specific(Op1I)),
+                            m_c_FMul(m_Specific(Op1R), m_Specific(Op0I)))))
+    return false;
+
+  // Pack both operands into 2-element vectors, emit the intrinsic and feed
+  // the extracted results back into the insertvalue chain.
+  auto *VecTy = FixedVectorType::get(Op0R->getType(), 2);
+  IRBuilder<> Builder(Inst->getContext());
+  Builder.SetInsertPoint(cast<Instruction>(Inst->getOperand(0)));
+  Value *Vec0Op0 =
+      Builder.CreateInsertElement(UndefValue::get(VecTy), Op0R, 0ull);
+  Value *Vec0Op1 = Builder.CreateInsertElement(Vec0Op0, Op0I, 1ull);
+  Value *Vec1Op0 =
+      Builder.CreateInsertElement(UndefValue::get(VecTy), Op1R, 0ull);
+  Value *Vec1Op1 = Builder.CreateInsertElement(Vec1Op0, Op1I, 1ull);
+  Value *Res = Builder.CreateIntrinsic(Intrinsic::complex_multiply, VecTy,
+                                       {Vec0Op1, Vec1Op1});
+
+  Inst->setOperand(1, Builder.CreateExtractElement(Res, 1ull));
+  cast<InsertValueInst>(Inst->getOperand(0))
+      ->setOperand(1, Builder.CreateExtractElement(Res, 0ull));
+  return true;
+}
+
 /// For the instruction sequence of store below, F and I values
 /// are bundled together as an i64 value before being stored into memory.
 /// Sometimes it is more efficient to generate separate stores for F and I,
@@ -7939,6 +7982,8 @@
     return optimizeExtractElementInst(cast<ExtractElementInst>(I));
   case Instruction::Br:
     return optimizeBranch(cast<BranchInst>(I), *TLI);
+  case Instruction::InsertValue:
+    return optimizeInsertValueInst(cast<InsertValueInst>(I));
   }
 
   return false;
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/complex-multiply.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/complex-multiply.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/complex-multiply.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -S %s | FileCheck %s
+
+target triple = "arm64-apple-ios"
+
+%complex_t = type { float, float }
+
+define %complex_t @complex_multiply_float(%complex_t* %x, %complex_t* %y) local_unnamed_addr #0 {
+; CHECK-LABEL: @complex_multiply_float(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X_R_ADDR1:%.*]] = bitcast %complex_t* [[X:%.*]] to float*
+; CHECK-NEXT:    [[X_R:%.*]] = load float, float* [[X_R_ADDR1]], align 4
+; CHECK-NEXT:    [[X_I_ADDR:%.*]] = getelementptr inbounds [[COMPLEX_T:%.*]], %complex_t* [[X]], i64 0, i32 1
+; CHECK-NEXT:    [[X_I:%.*]] = load float, float* [[X_I_ADDR]], align 4
+; CHECK-NEXT:    [[Y_R_ADDR2:%.*]] = bitcast %complex_t* [[Y:%.*]] to float*
+; CHECK-NEXT:    [[Y_R:%.*]] = load float, float* [[Y_R_ADDR2]], align 4
+; CHECK-NEXT:    [[Y_I_ADDR:%.*]] = getelementptr inbounds [[COMPLEX_T]], %complex_t* [[Y]], i64 0, i32 1
+; CHECK-NEXT:    [[Y_I:%.*]] = load float, float* [[Y_I_ADDR]], align 4
+; CHECK-NEXT:    [[MUL_I:%.*]] = fmul fast float [[Y_R]], [[X_R]]
+; CHECK-NEXT:    [[MUL4_I:%.*]] = fmul fast float [[Y_I]], [[X_I]]
+; CHECK-NEXT:    [[MUL5_I:%.*]] = fmul fast float [[Y_I]], [[X_R]]
+; CHECK-NEXT:    [[MUL6_I:%.*]] = fmul fast float [[Y_R]], [[X_I]]
+; CHECK-NEXT:    [[SUB_I:%.*]] = fsub fast float [[MUL_I]], [[MUL4_I]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = fadd fast float [[MUL5_I]], [[MUL6_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> undef, float [[Y_R]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[Y_I]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> undef, float [[X_R]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[X_I]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.complex.multiply.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i64 0
+; CHECK-NEXT:    [[RES_R:%.*]] = insertvalue [[COMPLEX_T]] undef, float [[TMP6]], 0
+; CHECK-NEXT:    [[RES_I:%.*]] = insertvalue [[COMPLEX_T]] [[RES_R]], float [[TMP5]], 1
+; CHECK-NEXT:    ret [[COMPLEX_T]] [[RES_I]]
+;
+entry:
+  %x.r.addr = getelementptr inbounds %complex_t, %complex_t* %x, i64 0, i32 0
+  %x.r = load float, float* %x.r.addr, align 4
+  %x.i.addr = getelementptr inbounds %complex_t, %complex_t* %x, i64 0, i32 1
+  %x.i = load float, float* %x.i.addr, align 4
+  %y.r.addr = getelementptr inbounds %complex_t, %complex_t* %y, i64 0, i32 0
+  %y.r = load float, float* %y.r.addr, align 4
+  %y.i.addr = getelementptr inbounds %complex_t, %complex_t* %y, i64 0, i32 1
+  %y.i = load float, float* %y.i.addr, align 4
+  %mul.i = fmul fast float %y.r, %x.r
+  %mul4.i = fmul fast float %y.i, %x.i
+  %mul5.i = fmul fast float %y.i, %x.r
+  %mul6.i = fmul fast float %y.r, %x.i
+  %sub.i = fsub fast float %mul.i, %mul4.i
+  %add.i = fadd fast float %mul5.i, %mul6.i
+  %res.r = insertvalue %complex_t undef, float %sub.i, 0
+  %res.i = insertvalue %complex_t %res.r, float %add.i, 1
+  ret %complex_t %res.i
+}
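
Note: the pattern recognized above is the scalar expansion of a complex multiply,
real = a*c - b*d and imag = a*d + b*c. As a rough illustration, a C/C++ function
along the lines of the following (a hypothetical reduction; the `fast` flags seen
in the test would come from building with something like -O2 -ffast-math) lowers
to the fmul/fsub/fadd plus insertvalue chain that optimizeInsertValueInst matches:

  typedef struct { float r, i; } complex_t;

  complex_t complex_multiply_float(const complex_t *x, const complex_t *y) {
    complex_t res;
    res.r = x->r * y->r - x->i * y->i; /* real part: fmul, fmul, fsub */
    res.i = x->r * y->i + x->i * y->r; /* imaginary part: fmul, fmul, fadd */
    return res;
  }

As the CHECK lines show, the rewrite only redirects the two insertvalue operands
to the extracted intrinsic results; the now-dead scalar fmul/fsub/fadd chain is
left in place for later passes to clean up.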