diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1233,6 +1233,11 @@ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); } + + if (Subtarget.isISA3_1()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + } } if (Subtarget.pairedVectorMemops()) { @@ -10041,14 +10046,34 @@ "Should only be called for ISD::INSERT_VECTOR_ELT"); ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2)); - // We have legal lowering for constant indices but not for variable ones. - if (!C) - return SDValue(); EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); + SDValue V3 = Op.getOperand(2); + + if (Subtarget.isISA3_1()) { + // On P10, we have legal lowering for constant and variable indices for + // integer vectors. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64) + return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3); + // For f32 and f64 vectors, we have legal lowering for variable indices. + // For f32 we also have legal lowering when the element is loaded from + // memory. + if (VT == MVT::v4f32 || VT == MVT::v2f64) { + if (!C || (VT == MVT::v4f32 && dyn_cast<LoadSDNode>(V2))) + return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3); + return SDValue(); + } + } + + // Before P10, we have legal lowering for constant indices but not for + // variable ones. + if (!C) + return SDValue(); + // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types. 
if (VT == MVT::v8i16 || VT == MVT::v16i8) { SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2); diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -26,6 +26,9 @@ def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [ SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1> ]>; +def SDT_PPCVecInsertElt : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<3> +]>; //===----------------------------------------------------------------------===// // ISA 3.1 specific PPCISD nodes. @@ -39,6 +42,7 @@ def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx, []>; def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>; +def PPCvecinsertelt : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsertElt, []>; //===----------------------------------------------------------------------===// @@ -2665,3 +2669,99 @@ (XXBLENDVD $A, $B, $C)>; } +def InsertEltShift { + dag Sub32Left0 = (EXTRACT_SUBREG $rB, sub_32); + dag Sub32Left1 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 1, 0, 30); + dag Sub32Left2 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 2, 0, 29); + dag Left3 = (RLWINM8 $rB, 3, 0, 28); +} + +let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in { + // Indexed vector insert element + def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)), + (VINSBRX $vDi, InsertEltShift.Sub32Left0, $rA)>; + def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)), + (VINSHRX $vDi, InsertEltShift.Sub32Left1, $rA)>; + def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)), + (VINSWRX $vDi, InsertEltShift.Sub32Left2, $rA)>; + def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, $rA)>; + + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)), + (VINSWRX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load 
iaddr:$rA)), i64:$rB)), + (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)), + (VINSWRX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)), + (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>; + + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>; + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>; + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)), + (VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>; + + // Immediate vector insert element + foreach i = [0, 1, 2, 3] in { + def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))), + (VINSW $vDi, !mul(!sub(3, i), 4), $rA)>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))), + (VINSW $vDi, !mul(!sub(3, i), 4), (LWZ memri:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))), + (VINSW $vDi, !mul(!sub(3, i), 4), (PLWZ memri34:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))), + (VINSW $vDi, !mul(!sub(3, i), 4), (LWZX memrr:$rA))>; + } + foreach i = [0, 1] in + def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))), + (VINSD $vDi, !mul(!sub(1, i), 8), $rA)>; +} + +let Predicates = [IsISA3_1, HasVSX, IsBigEndian] in { + // Indexed vector insert element + def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)), + (VINSBLX $vDi, InsertEltShift.Sub32Left0, $rA)>; + def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)), + (VINSHLX $vDi, 
InsertEltShift.Sub32Left1, $rA)>; + def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)), + (VINSWLX $vDi, InsertEltShift.Sub32Left2, $rA)>; + def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)), + (VINSDLX $vDi, InsertEltShift.Left3, $rA)>; + + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)), + (VINSWLX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)), + (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)), + (VINSWLX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)), + (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>; + + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)), + (VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)), + (VINSDLX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>; + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)), + (VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>; + def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)), + (VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>; + + // Immediate vector insert element + foreach i = [0, 1, 2, 3] in { + def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))), + (VINSW $vDi, !mul(i, 4), $rA)>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))), + (VINSW $vDi, !mul(i, 4), (LWZ memri:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))), + (VINSW $vDi, !mul(i, 4), (PLWZ memri34:$rA))>; + def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))), + (VINSW $vDi, !mul(i, 4), (LWZX memrr:$rA))>; + } + foreach i = [0, 1] 
in + def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))), + (VINSD $vDi, !mul(i, 8), $rA)>; +} diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -1810,6 +1810,14 @@ } //----------------------------- DAG Definitions ------------------------------// + +// Output dag used to bitcast f32 to i32 and f64 to i64 +def Bitcast { + dag FltToInt = (i32 (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI (XSCVDPSPN $A), + (XSCVDPSPN $A), 3), sub_64))); + dag DblToLong = (i64 (MFVSRD $A)); +} + def FpMinMax { dag F32Min = (COPY_TO_REGCLASS (XSMINDP (COPY_TO_REGCLASS $A, VSFRC), (COPY_TO_REGCLASS $B, VSFRC)), @@ -3345,10 +3353,8 @@ let Predicates = [HasVSX, HasDirectMove] in { // bitconvert f32 -> i32 // (convert to 32-bit fp single, shift right 1 word, move to GPR) -def : Pat<(i32 (bitconvert f32:$S)), - (i32 (MFVSRWZ (EXTRACT_SUBREG - (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3), - sub_64)))>; +def : Pat<(i32 (bitconvert f32:$A)), Bitcast.FltToInt>; + // bitconvert i32 -> f32 // (move to FPR, shift left 1 word, convert to 64-bit fp single) def : Pat<(f32 (bitconvert i32:$A)), @@ -3357,8 +3363,7 @@ // bitconvert f64 -> i64 // (move to GPR, nothing else needed) -def : Pat<(i64 (bitconvert f64:$S)), - (i64 (MFVSRD $S))>; +def : Pat<(i64 (bitconvert f64:$A)), Bitcast.DblToLong>; // bitconvert i64 -> f64 // (move to FPR, nothing else needed) diff --git a/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll b/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll --- a/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll @@ -42,11 +42,11 @@ ; CHECK-NEXT: pld r4, output8@got@pcrel(0), 1 ; CHECK-NEXT: .reloc .Lpcrel0-8,R_PPC64_PCREL_OPT,.-(.Lpcrel0-8) ; CHECK-NEXT: lbz r3, 0(r3) +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr ; In this test the stb r3, 0(r4) cannot be optimized because it ; uses the register r3 and that register is 
defined by lbz r3, 0(r3) ; which is defined between the pld and the stb. -; CHECK-NEXT: stb r3, 0(r4) -; CHECK-NEXT: blr entry: %0 = load i8, i8* @input8, align 1 store i8 %0, i8* @output8, align 1 @@ -61,11 +61,11 @@ ; CHECK-NEXT: pld r4, output16@got@pcrel(0), 1 ; CHECK-NEXT: .reloc .Lpcrel1-8,R_PPC64_PCREL_OPT,.-(.Lpcrel1-8) ; CHECK-NEXT: lhz r3, 0(r3) +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr ; In this test the sth r3, 0(r4) cannot be optimized because it ; uses the register r3 and that register is defined by lhz r3, 0(r3) ; which is defined between the pld and the sth. -; CHECK-NEXT: sth r3, 0(r4) -; CHECK-NEXT: blr entry: %0 = load i16, i16* @input16, align 2 store i16 %0, i16* @output16, align 2 @@ -165,11 +165,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pld r3, inputVi32@got@pcrel(0), 1 ; CHECK-NEXT: li r4, 45 -; CHECK-NEXT: mtfprwz f1, r4 -; CHECK-NEXT: lxvx vs0, 0, r3 +; CHECK-NEXT: lxvx v2, 0, r3 ; CHECK-NEXT: pld r3, outputVi32@got@pcrel(0), 1 -; CHECK-NEXT: xxinsertw vs0, vs1, 8 -; CHECK-NEXT: stxvx vs0, 0, r3 +; CHECK-NEXT: vinsw v2, r4, 8 +; CHECK-NEXT: stxvx v2, 0, r3 ; CHECK-NEXT: blr entry: %0 = load <4 x i32>, <4 x i32>* @inputVi32, align 16 @@ -286,8 +285,7 @@ define dso_local void @FuncPtrCall() local_unnamed_addr #0 { ; CHECK-LABEL: FuncPtrCall: -; CHECK: .localentry FuncPtrCall, 1 -; CHECK-NEXT: # %bb.0: # %entry +; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pld r3, FuncPtrIn@got@pcrel(0), 1 ; CHECK-NEXT: .Lpcrel10: ; CHECK-NEXT: .reloc .Lpcrel10-8,R_PPC64_PCREL_OPT,.-(.Lpcrel10-8) @@ -317,8 +315,7 @@ define dso_local signext i32 @VecMultiUse() local_unnamed_addr #0 { ; CHECK-LABEL: VecMultiUse: -; CHECK: .localentry VecMultiUse, 1 -; CHECK-NEXT: # %bb.0: # %entry +; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill @@ -355,8 +352,7 @@ define dso_local signext i32 @UseAddr(i32 signext %a) local_unnamed_addr #0 { ; CHECK-LABEL: UseAddr: 
-; CHECK: .localentry UseAddr, 1 -; CHECK-NEXT: # %bb.0: # %entry +; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll @@ -0,0 +1,740 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-P9 + +; Byte indexed + +define <16 x i8> @testByte(<16 x i8> %a, i64 %b, i64 %idx) { +; CHECK-LABEL: testByte: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vinsbrx v2, r6, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testByte: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: vinsblx v2, r6, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testByte: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r4, r1, -16 +; CHECK-P9-NEXT: clrldi r3, r6, 60 +; CHECK-P9-NEXT: stxv v2, -16(r1) +; CHECK-P9-NEXT: stbx r5, r4, r3 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %conv = trunc i64 %b to i8 + %vecins = insertelement <16 x i8> %a, i8 %conv, i64 %idx + ret <16 x i8> %vecins +} + +; Halfword indexed + +define <8 x i16> @testHalf(<8 x i16> %a, i64 %b, i64 %idx) { +; CHECK-LABEL: testHalf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slwi r3, r6, 1 +; CHECK-NEXT: vinshrx v2, r3, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testHalf: +; CHECK-BE: # 
%bb.0: # %entry +; CHECK-BE-NEXT: slwi r3, r6, 1 +; CHECK-BE-NEXT: vinshlx v2, r3, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testHalf: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r4, r1, -16 +; CHECK-P9-NEXT: rlwinm r3, r6, 1, 28, 30 +; CHECK-P9-NEXT: stxv v2, -16(r1) +; CHECK-P9-NEXT: sthx r5, r4, r3 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %conv = trunc i64 %b to i16 + %vecins = insertelement <8 x i16> %a, i16 %conv, i64 %idx + ret <8 x i16> %vecins +} + +; Word indexed + +define <4 x i32> @testWord(<4 x i32> %a, i64 %b, i64 %idx) { +; CHECK-LABEL: testWord: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slwi r3, r6, 2 +; CHECK-NEXT: vinswrx v2, r3, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testWord: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: slwi r3, r6, 2 +; CHECK-BE-NEXT: vinswlx v2, r3, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testWord: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r4, r1, -16 +; CHECK-P9-NEXT: rlwinm r3, r6, 2, 28, 29 +; CHECK-P9-NEXT: stxv v2, -16(r1) +; CHECK-P9-NEXT: stwx r5, r4, r3 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %conv = trunc i64 %b to i32 + %vecins = insertelement <4 x i32> %a, i32 %conv, i64 %idx + ret <4 x i32> %vecins +} + +; Word immediate + +define <4 x i32> @testWordImm(<4 x i32> %a, i64 %b) { +; CHECK-LABEL: testWordImm: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vinsw v2, r5, 8 +; CHECK-NEXT: vinsw v2, r5, 0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testWordImm: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: vinsw v2, r5, 4 +; CHECK-BE-NEXT: vinsw v2, r5, 12 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testWordImm: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: mtfprwz f0, r5 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 4 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-P9-NEXT: blr +entry: + %conv = trunc i64 %b to i32 + %vecins = insertelement <4 x i32> %a, i32 %conv, i32 1 + %vecins2 = insertelement <4 x i32> %vecins, i32 %conv, i32 3 + ret 
<4 x i32> %vecins2 +} + +; Doubleword indexed + +define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) { +; CHECK-LABEL: testDoubleword: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rlwinm r3, r6, 3, 0, 28 +; CHECK-NEXT: vinsdrx v2, r3, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoubleword: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: rlwinm r3, r6, 3, 0, 28 +; CHECK-BE-NEXT: vinsdlx v2, r3, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoubleword: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r4, r1, -16 +; CHECK-P9-NEXT: rlwinm r3, r6, 3, 28, 28 +; CHECK-P9-NEXT: stxv v2, -16(r1) +; CHECK-P9-NEXT: stdx r5, r4, r3 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %vecins = insertelement <2 x i64> %a, i64 %b, i64 %idx + ret <2 x i64> %vecins +} + +; Doubleword immediate + +define <2 x i64> @testDoublewordImm(<2 x i64> %a, i64 %b) { +; CHECK-LABEL: testDoublewordImm: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vinsd v2, r5, 0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoublewordImm: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: vinsd v2, r5, 8 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoublewordImm: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: mtfprd f0, r5 +; CHECK-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-P9-NEXT: blr +entry: + %vecins = insertelement <2 x i64> %a, i64 %b, i32 1 + ret <2 x i64> %vecins +} + +define <2 x i64> @testDoublewordImm2(<2 x i64> %a, i64 %b) { +; CHECK-LABEL: testDoublewordImm2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vinsd v2, r5, 8 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoublewordImm2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: vinsd v2, r5, 0 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoublewordImm2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: mtfprd f0, r5 +; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-P9-NEXT: blr +entry: + %vecins = insertelement <2 x i64> %a, i64 %b, i32 0 + ret <2 x i64> %vecins +} + +; Float indexed + +define <4 x float> 
@testFloat1(<4 x float> %a, float %b, i32 zeroext %idx1) { +; CHECK-LABEL: testFloat1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xscvdpspn vs0, f1 +; CHECK-NEXT: extsw r3, r6 +; CHECK-NEXT: slwi r3, r3, 2 +; CHECK-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-NEXT: mffprwz r4, f0 +; CHECK-NEXT: vinswrx v2, r3, r4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testFloat1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xscvdpspn vs0, f1 +; CHECK-BE-NEXT: extsw r3, r6 +; CHECK-BE-NEXT: slwi r3, r3, 2 +; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-BE-NEXT: mffprwz r4, f0 +; CHECK-BE-NEXT: vinswlx v2, r3, r4 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testFloat1: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r4, r1, -16 +; CHECK-P9-NEXT: rlwinm r3, r6, 2, 28, 29 +; CHECK-P9-NEXT: stxv v2, -16(r1) +; CHECK-P9-NEXT: stfsx f1, r4, r3 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %vecins = insertelement <4 x float> %a, float %b, i32 %idx1 + ret <4 x float> %vecins +} + +define <4 x float> @testFloat2(<4 x float> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) { +; CHECK-LABEL: testFloat2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz r3, 0(r5) +; CHECK-NEXT: extsw r4, r6 +; CHECK-NEXT: slwi r4, r4, 2 +; CHECK-NEXT: vinswrx v2, r4, r3 +; CHECK-NEXT: lwz r3, 1(r5) +; CHECK-NEXT: extsw r4, r7 +; CHECK-NEXT: slwi r4, r4, 2 +; CHECK-NEXT: vinswrx v2, r4, r3 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testFloat2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lwz r3, 0(r5) +; CHECK-BE-NEXT: extsw r4, r6 +; CHECK-BE-NEXT: slwi r4, r4, 2 +; CHECK-BE-NEXT: vinswlx v2, r4, r3 +; CHECK-BE-NEXT: lwz r3, 1(r5) +; CHECK-BE-NEXT: extsw r4, r7 +; CHECK-BE-NEXT: slwi r4, r4, 2 +; CHECK-BE-NEXT: vinswlx v2, r4, r3 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testFloat2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lwz r3, 0(r5) +; CHECK-P9-NEXT: rlwinm r4, r6, 2, 28, 29 +; CHECK-P9-NEXT: addi r6, r1, -32 +; CHECK-P9-NEXT: stxv v2, -32(r1) +; CHECK-P9-NEXT: stwx 
r3, r6, r4 +; CHECK-P9-NEXT: rlwinm r4, r7, 2, 28, 29 +; CHECK-P9-NEXT: lxv vs0, -32(r1) +; CHECK-P9-NEXT: lwz r3, 1(r5) +; CHECK-P9-NEXT: addi r5, r1, -16 +; CHECK-P9-NEXT: stxv vs0, -16(r1) +; CHECK-P9-NEXT: stwx r3, r5, r4 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %0 = bitcast i8* %b to float* + %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 1 + %1 = bitcast i8* %add.ptr1 to float* + %2 = load float, float* %0, align 4 + %vecins = insertelement <4 x float> %a, float %2, i32 %idx1 + %3 = load float, float* %1, align 4 + %vecins2 = insertelement <4 x float> %vecins, float %3, i32 %idx2 + ret <4 x float> %vecins2 +} + +define <4 x float> @testFloat3(<4 x float> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) { +; CHECK-LABEL: testFloat3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: plwz r3, 65536(r5), 0 +; CHECK-NEXT: extsw r4, r6 +; CHECK-NEXT: slwi r4, r4, 2 +; CHECK-NEXT: vinswrx v2, r4, r3 +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: extsw r4, r7 +; CHECK-NEXT: rldic r3, r3, 36, 27 +; CHECK-NEXT: slwi r4, r4, 2 +; CHECK-NEXT: lwzx r3, r5, r3 +; CHECK-NEXT: vinswrx v2, r4, r3 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testFloat3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: plwz r3, 65536(r5), 0 +; CHECK-BE-NEXT: extsw r4, r6 +; CHECK-BE-NEXT: slwi r4, r4, 2 +; CHECK-BE-NEXT: vinswlx v2, r4, r3 +; CHECK-BE-NEXT: li r3, 1 +; CHECK-BE-NEXT: extsw r4, r7 +; CHECK-BE-NEXT: rldic r3, r3, 36, 27 +; CHECK-BE-NEXT: slwi r4, r4, 2 +; CHECK-BE-NEXT: lwzx r3, r5, r3 +; CHECK-BE-NEXT: vinswlx v2, r4, r3 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testFloat3: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lis r3, 1 +; CHECK-P9-NEXT: rlwinm r4, r6, 2, 28, 29 +; CHECK-P9-NEXT: addi r6, r1, -32 +; CHECK-P9-NEXT: lwzx r3, r5, r3 +; CHECK-P9-NEXT: stxv v2, -32(r1) +; CHECK-P9-NEXT: stwx r3, r6, r4 +; CHECK-P9-NEXT: li r3, 1 +; CHECK-P9-NEXT: rlwinm r4, r7, 2, 28, 29 +; CHECK-P9-NEXT: lxv vs0, -32(r1) +; CHECK-P9-NEXT: rldic r3, r3, 36, 27 +; 
CHECK-P9-NEXT: lwzx r3, r5, r3 +; CHECK-P9-NEXT: addi r5, r1, -16 +; CHECK-P9-NEXT: stxv vs0, -16(r1) +; CHECK-P9-NEXT: stwx r3, r5, r4 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %add.ptr = getelementptr inbounds i8, i8* %b, i64 65536 + %0 = bitcast i8* %add.ptr to float* + %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 68719476736 + %1 = bitcast i8* %add.ptr1 to float* + %2 = load float, float* %0, align 4 + %vecins = insertelement <4 x float> %a, float %2, i32 %idx1 + %3 = load float, float* %1, align 4 + %vecins2 = insertelement <4 x float> %vecins, float %3, i32 %idx2 + ret <4 x float> %vecins2 +} + +; Float immediate + +define <4 x float> @testFloatImm1(<4 x float> %a, float %b) { +; CHECK-LABEL: testFloatImm1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xscvdpspn vs0, f1 +; CHECK-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-NEXT: xxinsertw v2, vs0, 12 +; CHECK-NEXT: xxinsertw v2, vs0, 4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testFloatImm1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xscvdpspn vs0, f1 +; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-BE-NEXT: xxinsertw v2, vs0, 0 +; CHECK-BE-NEXT: xxinsertw v2, vs0, 8 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testFloatImm1: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: xscvdpspn vs0, f1 +; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: blr +entry: + %vecins = insertelement <4 x float> %a, float %b, i32 0 + %vecins1 = insertelement <4 x float> %vecins, float %b, i32 2 + ret <4 x float> %vecins1 +} + +define <4 x float> @testFloatImm2(<4 x float> %a, i32* %b) { +; CHECK-LABEL: testFloatImm2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz r3, 0(r5) +; CHECK-NEXT: vinsw v2, r3, 12 +; CHECK-NEXT: lwz r3, 4(r5) +; CHECK-NEXT: vinsw v2, r3, 4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testFloatImm2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lwz r3, 0(r5) +; CHECK-BE-NEXT: vinsw v2, r3, 0 +; 
CHECK-BE-NEXT: lwz r3, 4(r5) +; CHECK-BE-NEXT: vinsw v2, r3, 8 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testFloatImm2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lfs f0, 0(r5) +; CHECK-P9-NEXT: xscvdpspn vs0, f0 +; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-P9-NEXT: lfs f0, 4(r5) +; CHECK-P9-NEXT: xscvdpspn vs0, f0 +; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: blr +entry: + %0 = bitcast i32* %b to float* + %add.ptr1 = getelementptr inbounds i32, i32* %b, i64 1 + %1 = bitcast i32* %add.ptr1 to float* + %2 = load float, float* %0, align 4 + %vecins = insertelement <4 x float> %a, float %2, i32 0 + %3 = load float, float* %1, align 4 + %vecins2 = insertelement <4 x float> %vecins, float %3, i32 2 + ret <4 x float> %vecins2 +} + +define <4 x float> @testFloatImm3(<4 x float> %a, i32* %b) { +; CHECK-LABEL: testFloatImm3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: plwz r3, 262144(r5), 0 +; CHECK-NEXT: vinsw v2, r3, 12 +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: rldic r3, r3, 38, 25 +; CHECK-NEXT: lwzx r3, r5, r3 +; CHECK-NEXT: vinsw v2, r3, 4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testFloatImm3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: plwz r3, 262144(r5), 0 +; CHECK-BE-NEXT: vinsw v2, r3, 0 +; CHECK-BE-NEXT: li r3, 1 +; CHECK-BE-NEXT: rldic r3, r3, 38, 25 +; CHECK-BE-NEXT: lwzx r3, r5, r3 +; CHECK-BE-NEXT: vinsw v2, r3, 8 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testFloatImm3: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lis r3, 4 +; CHECK-P9-NEXT: lfsx f0, r5, r3 +; CHECK-P9-NEXT: li r3, 1 +; CHECK-P9-NEXT: rldic r3, r3, 38, 25 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 +; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-P9-NEXT: lfsx f0, r5, r3 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 +; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3 +; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: blr +entry: + %add.ptr = getelementptr 
inbounds i32, i32* %b, i64 65536 + %0 = bitcast i32* %add.ptr to float* + %add.ptr1 = getelementptr inbounds i32, i32* %b, i64 68719476736 + %1 = bitcast i32* %add.ptr1 to float* + %2 = load float, float* %0, align 4 + %vecins = insertelement <4 x float> %a, float %2, i32 0 + %3 = load float, float* %1, align 4 + %vecins2 = insertelement <4 x float> %vecins, float %3, i32 2 + ret <4 x float> %vecins2 +} + +; Double indexed + +define <2 x double> @testDouble1(<2 x double> %a, double %b, i32 zeroext %idx1) { +; CHECK-LABEL: testDouble1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: extsw r4, r6 +; CHECK-NEXT: mffprd r3, f1 +; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-NEXT: vinsdrx v2, r4, r3 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDouble1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: extsw r4, r6 +; CHECK-BE-NEXT: mffprd r3, f1 +; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-BE-NEXT: vinsdlx v2, r4, r3 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDouble1: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r4, r1, -16 +; CHECK-P9-NEXT: rlwinm r3, r6, 3, 28, 28 +; CHECK-P9-NEXT: stxv v2, -16(r1) +; CHECK-P9-NEXT: stfdx f1, r4, r3 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %vecins = insertelement <2 x double> %a, double %b, i32 %idx1 + ret <2 x double> %vecins +} + +define <2 x double> @testDouble2(<2 x double> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) { +; CHECK-LABEL: testDouble2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld r3, 0(r5) +; CHECK-NEXT: extsw r4, r6 +; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-NEXT: vinsdrx v2, r4, r3 +; CHECK-NEXT: pld r3, 1(r5), 0 +; CHECK-NEXT: extsw r4, r7 +; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-NEXT: vinsdrx v2, r4, r3 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDouble2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: ld r3, 0(r5) +; CHECK-BE-NEXT: extsw r4, r6 +; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-BE-NEXT: vinsdlx v2, r4, r3 +; CHECK-BE-NEXT: pld r3, 1(r5), 
0 +; CHECK-BE-NEXT: extsw r4, r7 +; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-BE-NEXT: vinsdlx v2, r4, r3 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDouble2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: ld r3, 0(r5) +; CHECK-P9-NEXT: rlwinm r4, r6, 3, 28, 28 +; CHECK-P9-NEXT: addi r6, r1, -32 +; CHECK-P9-NEXT: stxv v2, -32(r1) +; CHECK-P9-NEXT: stdx r3, r6, r4 +; CHECK-P9-NEXT: li r3, 1 +; CHECK-P9-NEXT: rlwinm r4, r7, 3, 28, 28 +; CHECK-P9-NEXT: lxv vs0, -32(r1) +; CHECK-P9-NEXT: ldx r3, r5, r3 +; CHECK-P9-NEXT: addi r5, r1, -16 +; CHECK-P9-NEXT: stxv vs0, -16(r1) +; CHECK-P9-NEXT: stdx r3, r5, r4 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %0 = bitcast i8* %b to double* + %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 1 + %1 = bitcast i8* %add.ptr1 to double* + %2 = load double, double* %0, align 8 + %vecins = insertelement <2 x double> %a, double %2, i32 %idx1 + %3 = load double, double* %1, align 8 + %vecins2 = insertelement <2 x double> %vecins, double %3, i32 %idx2 + ret <2 x double> %vecins2 +} + +define <2 x double> @testDouble3(<2 x double> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) { +; CHECK-LABEL: testDouble3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pld r3, 65536(r5), 0 +; CHECK-NEXT: extsw r4, r6 +; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-NEXT: vinsdrx v2, r4, r3 +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: extsw r4, r7 +; CHECK-NEXT: rldic r3, r3, 36, 27 +; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-NEXT: ldx r3, r5, r3 +; CHECK-NEXT: vinsdrx v2, r4, r3 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDouble3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: pld r3, 65536(r5), 0 +; CHECK-BE-NEXT: extsw r4, r6 +; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-BE-NEXT: vinsdlx v2, r4, r3 +; CHECK-BE-NEXT: li r3, 1 +; CHECK-BE-NEXT: extsw r4, r7 +; CHECK-BE-NEXT: rldic r3, r3, 36, 27 +; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28 +; CHECK-BE-NEXT: ldx r3, r5, r3 +; CHECK-BE-NEXT: vinsdlx v2, r4, r3 +; 
CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDouble3: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lis r3, 1 +; CHECK-P9-NEXT: rlwinm r4, r6, 3, 28, 28 +; CHECK-P9-NEXT: addi r6, r1, -32 +; CHECK-P9-NEXT: ldx r3, r5, r3 +; CHECK-P9-NEXT: stxv v2, -32(r1) +; CHECK-P9-NEXT: stdx r3, r6, r4 +; CHECK-P9-NEXT: li r3, 1 +; CHECK-P9-NEXT: rlwinm r4, r7, 3, 28, 28 +; CHECK-P9-NEXT: lxv vs0, -32(r1) +; CHECK-P9-NEXT: rldic r3, r3, 36, 27 +; CHECK-P9-NEXT: ldx r3, r5, r3 +; CHECK-P9-NEXT: addi r5, r1, -16 +; CHECK-P9-NEXT: stxv vs0, -16(r1) +; CHECK-P9-NEXT: stdx r3, r5, r4 +; CHECK-P9-NEXT: lxv v2, -16(r1) +; CHECK-P9-NEXT: blr +entry: + %add.ptr = getelementptr inbounds i8, i8* %b, i64 65536 + %0 = bitcast i8* %add.ptr to double* + %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 68719476736 + %1 = bitcast i8* %add.ptr1 to double* + %2 = load double, double* %0, align 8 + %vecins = insertelement <2 x double> %a, double %2, i32 %idx1 + %3 = load double, double* %1, align 8 + %vecins2 = insertelement <2 x double> %vecins, double %3, i32 %idx2 + ret <2 x double> %vecins2 +} + +; Double immediate + +define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) { +; CHECK-LABEL: testDoubleImm1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; CHECK-NEXT: xxmrghd v2, v2, vs1 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoubleImm1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; CHECK-BE-NEXT: xxpermdi v2, vs1, v2, 1 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoubleImm1: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; CHECK-P9-NEXT: xxpermdi v2, vs1, v2, 1 +; CHECK-P9-NEXT: blr +entry: + %vecins = insertelement <2 x double> %a, double %b, i32 0 + ret <2 x double> %vecins +} + +define <2 x double> @testDoubleImm2(<2 x double> %a, i32* %b) { +; CHECK-LABEL: testDoubleImm2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfd f0, 0(r5) +; CHECK-NEXT: xxmrghd v2, 
v2, vs0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoubleImm2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lfd f0, 0(r5) +; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoubleImm2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lfd f0, 0(r5) +; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-P9-NEXT: blr +entry: + %0 = bitcast i32* %b to double* + %1 = load double, double* %0, align 8 + %vecins = insertelement <2 x double> %a, double %1, i32 0 + ret <2 x double> %vecins +} + +define <2 x double> @testDoubleImm3(<2 x double> %a, i32* %b) { +; CHECK-LABEL: testDoubleImm3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfd f0, 4(r5) +; CHECK-NEXT: xxmrghd v2, v2, vs0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoubleImm3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lfd f0, 4(r5) +; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoubleImm3: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lfd f0, 4(r5) +; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-P9-NEXT: blr +entry: + %add.ptr = getelementptr inbounds i32, i32* %b, i64 1 + %0 = bitcast i32* %add.ptr to double* + %1 = load double, double* %0, align 8 + %vecins = insertelement <2 x double> %a, double %1, i32 0 + ret <2 x double> %vecins +} + +define <2 x double> @testDoubleImm4(<2 x double> %a, i32* %b) { +; CHECK-LABEL: testDoubleImm4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lis r3, 4 +; CHECK-NEXT: lfdx f0, r5, r3 +; CHECK-NEXT: xxmrghd v2, v2, vs0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoubleImm4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lis r3, 4 +; CHECK-BE-NEXT: lfdx f0, r5, r3 +; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoubleImm4: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lis r3, 4 +; CHECK-P9-NEXT: lfdx f0, r5, r3 +; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-P9-NEXT: blr +entry: + %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536 + 
%0 = bitcast i32* %add.ptr to double* + %1 = load double, double* %0, align 8 + %vecins = insertelement <2 x double> %a, double %1, i32 0 + ret <2 x double> %vecins +} + +define <2 x double> @testDoubleImm5(<2 x double> %a, i32* %b) { +; CHECK-LABEL: testDoubleImm5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: rldic r3, r3, 38, 25 +; CHECK-NEXT: lfdx f0, r5, r3 +; CHECK-NEXT: xxmrghd v2, v2, vs0 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testDoubleImm5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r3, 1 +; CHECK-BE-NEXT: rldic r3, r3, 38, 25 +; CHECK-BE-NEXT: lfdx f0, r5, r3 +; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: testDoubleImm5: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: li r3, 1 +; CHECK-P9-NEXT: rldic r3, r3, 38, 25 +; CHECK-P9-NEXT: lfdx f0, r5, r3 +; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-P9-NEXT: blr +entry: + %add.ptr = getelementptr inbounds i32, i32* %b, i64 68719476736 + %0 = bitcast i32* %add.ptr to double* + %1 = load double, double* %0, align 8 + %vecins = insertelement <2 x double> %a, double %1, i32 0 + ret <2 x double> %vecins +} +