diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -573,6 +573,8 @@ BUILTIN(__builtin_vsx_lxvll, "V4ivC*ULLi", "") BUILTIN(__builtin_vsx_stxvl, "vV4iv*ULLi", "") BUILTIN(__builtin_vsx_stxvll, "vV4iv*ULLi", "") +BUILTIN(__builtin_vsx_ldrmb, "V16UcCc*Ii", "") +BUILTIN(__builtin_vsx_strmb, "vCc*IiV16Uc", "") BUILTIN(__builtin_vsx_xvmaxdp, "V2dV2dV2d", "") BUILTIN(__builtin_vsx_xvmaxsp, "V4fV4fV4f", "") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -15069,6 +15069,143 @@ llvm::Function *F = CGM.getIntrinsic(ID); return Builder.CreateCall(F, Ops, ""); } + case PPC::BI__builtin_vsx_ldrmb: { + // Essentially boils down to performing an unaligned VMX load sequence so + // as to avoid crossing a page boundary and then shuffling the elements + // into the right side of the vector register. + int64_t NumBytes = cast(Ops[1])->getZExtValue(); + llvm::Type *ResTy = ConvertType(E->getType()); + bool IsLE = getTarget().isLittleEndian(); + + // If the user wants the entire vector, just load the entire vector. + if (NumBytes == 16) { + Value *BC = Builder.CreateBitCast(Ops[0], ResTy->getPointerTo()); + Value *LD = Builder.CreateLoad(Address(BC, CharUnits::fromQuantity(1))); + if (!IsLE) + return LD; + + // Reverse the bytes on LE. + SmallVector RevMask; + for (int Idx = 0; Idx < 16; Idx++) + RevMask.push_back(15 - Idx); + return Builder.CreateShuffleVector(LD, LD, RevMask); + } + + llvm::Function *Lvx = CGM.getIntrinsic(Intrinsic::ppc_altivec_lvx); + llvm::Function *Lvs = CGM.getIntrinsic(IsLE ? 
Intrinsic::ppc_altivec_lvsr + : Intrinsic::ppc_altivec_lvsl); + llvm::Function *Vperm = CGM.getIntrinsic(Intrinsic::ppc_altivec_vperm); + Value *HiMem = Builder.CreateGEP( + Int8Ty, Ops[0], ConstantInt::get(Ops[1]->getType(), NumBytes - 1)); + Value *LoLd = Builder.CreateCall(Lvx, Ops[0], "ld.lo"); + Value *HiLd = Builder.CreateCall(Lvx, HiMem, "ld.hi"); + Value *Mask1 = Builder.CreateCall(Lvs, Ops[0], "mask1"); + + Ops.clear(); + Ops.push_back(IsLE ? HiLd : LoLd); + Ops.push_back(IsLE ? LoLd : HiLd); + Ops.push_back(Mask1); + Value *AllElts = Builder.CreateCall(Vperm, Ops, "shuffle1"); + Constant *Zero = llvm::Constant::getNullValue(IsLE ? ResTy : AllElts->getType()); + + if (IsLE) { + SmallVector Consts; + for (int Idx = 0; Idx < 16; Idx++) { + int Val = (NumBytes - Idx - 1 >= 0) ? (NumBytes - Idx - 1) + : 16 - (NumBytes - Idx); + Consts.push_back(Val); + } + return Builder.CreateShuffleVector(Builder.CreateBitCast(AllElts, ResTy), + Zero, Consts); + } + SmallVector Consts; + for (int Idx = 0; Idx < 16; Idx++) + Consts.push_back(Builder.getInt8(NumBytes + Idx)); + Value *Mask2 = ConstantVector::get(Consts); + return Builder.CreateBitCast( + Builder.CreateCall(Vperm, {Zero, AllElts, Mask2}, "shuffle2"), ResTy); + } + case PPC::BI__builtin_vsx_strmb: { + int64_t NumBytes = cast(Ops[1])->getZExtValue(); + bool IsLE = getTarget().isLittleEndian(); + auto StoreSubVec = [&](unsigned Width, unsigned Offset, unsigned EltNo) { + // Storing the whole vector, simply store it on BE and reverse bytes and + // store on LE. 
+ if (Width == 16) { + Value *BC = + Builder.CreateBitCast(Ops[0], Ops[2]->getType()->getPointerTo()); + Value *StVec = Ops[2]; + if (IsLE) { + SmallVector RevMask; + for (int Idx = 0; Idx < 16; Idx++) + RevMask.push_back(15 - Idx); + StVec = Builder.CreateShuffleVector(Ops[2], Ops[2], RevMask); + } + return Builder.CreateStore(StVec, + Address(BC, CharUnits::fromQuantity(1))); + } + auto *ConvTy = Int64Ty; + unsigned NumElts = 0; + switch (Width) { + default: + llvm_unreachable("width for stores must be a power of 2"); + case 8: + ConvTy = Int64Ty; + NumElts = 2; + break; + case 4: + ConvTy = Int32Ty; + NumElts = 4; + break; + case 2: + ConvTy = Int16Ty; + NumElts = 8; + break; + case 1: + ConvTy = Int8Ty; + NumElts = 16; + break; + } + Value *Vec = Builder.CreateBitCast( + Ops[2], llvm::FixedVectorType::get(ConvTy, NumElts)); + Value *Ptr = Builder.CreateGEP(Int8Ty, Ops[0], + ConstantInt::get(Int64Ty, Offset)); + Value *PtrBC = Builder.CreateBitCast(Ptr, ConvTy->getPointerTo()); + Value *Elt = Builder.CreateExtractElement(Vec, EltNo); + if (IsLE && Width > 1) { + Function *F = CGM.getIntrinsic(Intrinsic::bswap, ConvTy); + Elt = Builder.CreateCall(F, Elt); + } + return Builder.CreateStore(Elt, + Address(PtrBC, CharUnits::fromQuantity(1))); + }; + unsigned Stored = 0; + unsigned RemainingBytes = NumBytes; + Value *Result; + if (NumBytes == 16) + return StoreSubVec(16, 0, 0); + if (NumBytes >= 8) { + Result = StoreSubVec(8, NumBytes - 8, IsLE ? 0 : 1); + RemainingBytes -= 8; + Stored += 8; + } + if (RemainingBytes >= 4) { + Result = StoreSubVec(4, NumBytes - Stored - 4, + IsLE ? (Stored >> 2) : 3 - (Stored >> 2)); + RemainingBytes -= 4; + Stored += 4; + } + if (RemainingBytes >= 2) { + Result = StoreSubVec(2, NumBytes - Stored - 2, + IsLE ? (Stored >> 1) : 7 - (Stored >> 1)); + RemainingBytes -= 2; + Stored += 2; + } + if (RemainingBytes) + Result = + StoreSubVec(1, NumBytes - Stored - 1, IsLE ? 
Stored : 15 - Stored); + return Result; + } // Square root case PPC::BI__builtin_vsx_xvsqrtsp: case PPC::BI__builtin_vsx_xvsqrtdp: { diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h --- a/clang/lib/Headers/altivec.h +++ b/clang/lib/Headers/altivec.h @@ -3143,6 +3143,15 @@ #endif #endif +#if defined(__POWER9_VECTOR__) && defined(__powerpc64__) +#define __vec_ldrmb(PTR, CNT) vec_xl_len_r((const unsigned char *)(PTR), (CNT)) +#define __vec_strmb(PTR, CNT, VAL) \ + vec_xst_len_r((VAL), (unsigned char *)(PTR), (CNT)) +#else +#define __vec_ldrmb __builtin_vsx_ldrmb +#define __vec_strmb __builtin_vsx_strmb +#endif + /* vec_cpsgn */ #ifdef __VSX__ diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3440,6 +3440,11 @@ case PPC::BI__builtin_ppc_lbarx: return SemaFeatureCheck(*this, TheCall, "isa-v207-instructions", diag::err_ppc_builtin_only_on_arch, "8"); + case PPC::BI__builtin_vsx_ldrmb: + case PPC::BI__builtin_vsx_strmb: + return SemaFeatureCheck(*this, TheCall, "isa-v207-instructions", + diag::err_ppc_builtin_only_on_arch, "8") || + SemaBuiltinConstantArgRange(TheCall, 1, 1, 16); #define CUSTOM_BUILTIN(Name, Intr, Types, Acc) \ case PPC::BI__builtin_##Name: \ return SemaBuiltinPPCMMACall(TheCall, Types); diff --git a/clang/test/CodeGen/builtins-ppc-ld-st-rmb.c b/clang/test/CodeGen/builtins-ppc-ld-st-rmb.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/builtins-ppc-ld-st-rmb.c @@ -0,0 +1,2256 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: powerpc-registered-target +// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -emit-llvm %s \ +// RUN: -target-cpu pwr8 -o - | FileCheck %s -check-prefix=BE-PWR8 +// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -emit-llvm %s \ +// RUN: -target-cpu pwr8 -o - | FileCheck %s -check-prefix=LE-PWR8 + +// RUN: %clang_cc1 -triple 
powerpc64-unknown-unknown -emit-llvm %s \ +// RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE-PWR9 +// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -emit-llvm %s \ +// RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=LE-PWR9 +// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s \ +// RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE32-PWR9 + +#include +// BE-PWR8-LABEL: @test_ldrmb1( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 0 +// BE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]]) +// BE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]]) +// BE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]]) +// BE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) +// BE-PWR8-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> +// BE-PWR8-NEXT: ret <16 x i8> [[TMP2]] +// +// LE-PWR8-LABEL: @test_ldrmb1( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 0 +// LE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]]) +// LE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]]) +// LE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP0]]) +// LE-PWR8-NEXT: [[SHUFFLE1:%.*]] = 
call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]]) +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8> +// LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> +// LE-PWR8-NEXT: ret <16 x i8> [[TMP3]] +// +// BE-PWR9-LABEL: @test_ldrmb1( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 1, i64* [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 +// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR3:[0-9]+]] +// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// BE-PWR9-NEXT: ret <16 x i8> [[TMP5]] +// +// LE-PWR9-LABEL: @test_ldrmb1( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: 
store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 1, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 +// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4:[0-9]+]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP14]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: ret <16 x i8> [[TMP15]] +// +// BE32-PWR9-LABEL: @test_ldrmb1( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], 
i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 0 +// BE32-PWR9-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]]) +// BE32-PWR9-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]]) +// BE32-PWR9-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]]) +// BE32-PWR9-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) +// BE32-PWR9-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> +// BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]] +// +vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); } + +// BE-PWR8-LABEL: @test_strmb1( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15 +// BE-PWR8-NEXT: store i8 [[TMP3]], i8* [[TMP2]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb1( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: 
[[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 +// LE-PWR8-NEXT: store i8 [[TMP3]], i8* [[TMP2]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb1( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 1, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb1( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// 
LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 1, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: 
[[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb1( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15 +// BE32-PWR9-NEXT: store i8 [[TMP3]], i8* [[TMP2]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb1(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 1, data); +} + +// BE-PWR8-LABEL: @test_strmb2( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], 
i64 7 +// BE-PWR8-NEXT: store i16 [[TMP5]], i16* [[TMP4]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb2( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) +// LE-PWR8-NEXT: store i16 [[TMP6]], i16* [[TMP4]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb2( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 2, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// 
BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb2( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 2, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 
x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb2( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7 +// BE32-PWR9-NEXT: store i16 [[TMP5]], i16* [[TMP4]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb2(char *ptr, vector unsigned char data) { + 
__vec_strmb(ptr, 2, data); +} + +// BE-PWR8-LABEL: @test_strmb3( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7 +// BE-PWR8-NEXT: store i16 [[TMP5]], i16* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13 +// BE-PWR8-NEXT: store i8 [[TMP7]], i8* [[TMP6]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb3( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) +// LE-PWR8-NEXT: store i16 [[TMP6]], i16* [[TMP4]], align 1 +// 
LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 2 +// LE-PWR8-NEXT: store i8 [[TMP8]], i8* [[TMP7]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb3( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 3, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb3( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: 
[[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 3, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: 
[[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb3( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7 +// BE32-PWR9-NEXT: store i16 [[TMP5]], i16* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13 +// BE32-PWR9-NEXT: store i8 [[TMP7]], i8* [[TMP6]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb3(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 3, data); +} + +// BE-PWR8-LABEL: @test_strmb4( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 
+// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +// BE-PWR8-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb4( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +// LE-PWR8-NEXT: store i32 [[TMP6]], i32* [[TMP4]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb4( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: 
store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 4, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb4( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 4, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x 
i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb4( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 3 +// BE32-PWR9-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb4(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 4, data); +} + +// BE-PWR8-LABEL: @test_strmb5( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +// BE-PWR8-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11 +// BE-PWR8-NEXT: store i8 [[TMP7]], i8* [[TMP6]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb5( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// LE-PWR8-NEXT: 
[[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +// LE-PWR8-NEXT: store i32 [[TMP6]], i32* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 4 +// LE-PWR8-NEXT: store i8 [[TMP8]], i8* [[TMP7]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb5( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 5, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb5( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: 
[[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 5, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* 
[[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb5( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +// BE32-PWR9-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11 +// BE32-PWR9-NEXT: store i8 [[TMP7]], i8* [[TMP6]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb5(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 5, data); +} + +// BE-PWR8-LABEL: @test_strmb6( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load 
i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +// BE-PWR8-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 +// BE-PWR8-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb6( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +// LE-PWR8-NEXT: store i32 [[TMP6]], i32* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> 
[[TMP7]], i64 2 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) +// LE-PWR8-NEXT: store i16 [[TMP11]], i16* [[TMP9]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb6( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 6, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb6( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: 
[[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 6, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// 
LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb6( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +// BE32-PWR9-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 +// BE32-PWR9-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb6(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 6, data); +} + +// BE-PWR8-LABEL: @test_strmb7( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], 
align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +// BE-PWR8-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 +// BE-PWR8-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 9 +// BE-PWR8-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb7( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +// LE-PWR8-NEXT: store i32 [[TMP6]], i32* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// LE-PWR8-NEXT: 
[[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 2 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) +// LE-PWR8-NEXT: store i16 [[TMP11]], i16* [[TMP9]], align 1 +// LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 6 +// LE-PWR8-NEXT: store i8 [[TMP13]], i8* [[TMP12]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb7( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 7, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb7( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// 
LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 7, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], 
align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb7( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3 +// BE32-PWR9-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5 +// BE32-PWR9-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 9 +// BE32-PWR9-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb7(char *ptr, vector 
unsigned char data) { + __vec_strmb(ptr, 7, data); +} + +// BE-PWR8-LABEL: @test_strmb8( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb8( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb8( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: 
[[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 8, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb8( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, 
i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 8, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb8( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// 
BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb8(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 8, data); +} +// BE-PWR8-LABEL: @test_ldrmb9( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 8 +// BE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]]) +// BE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]]) +// BE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]]) +// BE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) +// BE-PWR8-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> +// BE-PWR8-NEXT: ret <16 x i8> [[TMP2]] +// +// LE-PWR8-LABEL: @test_ldrmb9( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** 
[[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 8 +// LE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]]) +// LE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]]) +// LE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP0]]) +// LE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]]) +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8> +// LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> +// LE-PWR8-NEXT: ret <16 x i8> [[TMP3]] +// +// BE-PWR9-LABEL: @test_ldrmb9( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 9, i64* [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 +// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// BE-PWR9-NEXT: ret <16 x i8> [[TMP5]] +// +// LE-PWR9-LABEL: @test_ldrmb9( +// LE-PWR9-NEXT: entry: +// 
LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 9, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 +// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]] 
+// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP14]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: ret <16 x i8> [[TMP15]] +// +// BE32-PWR9-LABEL: @test_ldrmb9( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 8 +// BE32-PWR9-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]]) +// BE32-PWR9-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]]) +// BE32-PWR9-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]]) +// BE32-PWR9-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]]) +// BE32-PWR9-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> ) +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8> +// BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]] +// +vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); } + +// BE-PWR8-LABEL: @test_strmb9( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// 
BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7 +// BE-PWR8-NEXT: store i8 [[TMP7]], i8* [[TMP6]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb9( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 8 +// LE-PWR8-NEXT: store i8 [[TMP8]], i8* [[TMP7]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb9( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// 
BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 9, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb9( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 9, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* 
[[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb9( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 
4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7 +// BE32-PWR9-NEXT: store i8 [[TMP7]], i8* [[TMP6]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb9(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 9, data); +} + +// BE-PWR8-LABEL: @test_strmb10( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 +// BE-PWR8-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: 
@test_strmb10( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) +// LE-PWR8-NEXT: store i16 [[TMP11]], i16* [[TMP9]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb10( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 
+// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 10, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb10( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 10, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: 
[[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb10( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], 
i64 2 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 +// BE32-PWR9-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb10(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 10, data); +} + +// BE-PWR8-LABEL: @test_strmb11( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 +// BE-PWR8-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 
5 +// BE-PWR8-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb11( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// LE-PWR8-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]]) +// LE-PWR8-NEXT: store i16 [[TMP11]], i16* [[TMP9]], align 1 +// LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 10 +// LE-PWR8-NEXT: store i8 [[TMP13]], i8* [[TMP12]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb11( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 
x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 11, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb11( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 
11, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb11( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* 
[[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3 +// BE32-PWR9-NEXT: store i16 [[TMP9]], i16* [[TMP8]], align 1 +// BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 5 +// BE32-PWR9-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb11(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 11, data); +} + +// BE-PWR8-LABEL: @test_strmb12( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// 
BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE-PWR8-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb12( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) +// LE-PWR8-NEXT: store i32 [[TMP11]], i32* [[TMP9]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb12( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = 
alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 12, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb12( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x 
i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 12, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb12( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// 
BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE32-PWR9-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb12(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 12, data); +} + +// BE-PWR8-LABEL: @test_strmb13( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 5 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = 
bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE-PWR8-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3 +// BE-PWR8-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb13( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 5 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// LE-PWR8-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) +// LE-PWR8-NEXT: store i32 [[TMP11]], i32* [[TMP9]], align 1 +// LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> 
[[TMP1]], i64 12 +// LE-PWR8-NEXT: store i8 [[TMP13]], i8* [[TMP12]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb13( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 13, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb13( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store 
i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 13, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: 
call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb13( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 5 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE32-PWR9-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3 +// BE32-PWR9-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb13(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 13, data); +} + +// BE-PWR8-LABEL: @test_strmb14( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 
16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 6 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE-PWR8-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE-PWR8-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16* +// BE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1 +// BE-PWR8-NEXT: store i16 [[TMP13]], i16* [[TMP12]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb14( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 6 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// 
LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2 +// LE-PWR8-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) +// LE-PWR8-NEXT: store i32 [[TMP11]], i32* [[TMP9]], align 1 +// LE-PWR8-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i16* +// LE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6 +// LE-PWR8-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +// LE-PWR8-NEXT: store i16 [[TMP16]], i16* [[TMP14]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb14( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 14, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = 
bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb14( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 14, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: 
[[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb14( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 6 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr 
i8, i8* [[TMP0]], i64 2 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE32-PWR9-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE32-PWR9-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16* +// BE32-PWR9-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1 +// BE32-PWR9-NEXT: store i16 [[TMP13]], i16* [[TMP12]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb14(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 14, data); +} + +// BE-PWR8-LABEL: @test_strmb15( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 7 +// BE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE-PWR8-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// BE-PWR8-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE-PWR8-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE-PWR8-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE-PWR8-NEXT: [[TMP11:%.*]] = 
getelementptr i8, i8* [[TMP0]], i64 1 +// BE-PWR8-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16* +// BE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1 +// BE-PWR8-NEXT: store i16 [[TMP13]], i16* [[TMP12]], align 1 +// BE-PWR8-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1 +// BE-PWR8-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb15( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 7 +// LE-PWR8-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +// LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +// LE-PWR8-NEXT: store i64 [[TMP6]], i64* [[TMP4]], align 1 +// LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// LE-PWR8-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 +// LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) +// LE-PWR8-NEXT: store i32 [[TMP11]], i32* [[TMP9]], align 1 +// LE-PWR8-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LE-PWR8-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// LE-PWR8-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] 
to i16* +// LE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6 +// LE-PWR8-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +// LE-PWR8-NEXT: store i16 [[TMP16]], i16* [[TMP14]], align 1 +// LE-PWR8-NEXT: [[TMP17:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// LE-PWR8-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[TMP1]], i64 14 +// LE-PWR8-NEXT: store i8 [[TMP18]], i8* [[TMP17]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb15( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 15, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb15( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = 
alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 15, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: 
[[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb15( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 7 +// BE32-PWR9-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64* +// BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +// BE32-PWR9-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 1 +// BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3 +// BE32-PWR9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +// BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1 +// BE32-PWR9-NEXT: store i32 [[TMP9]], i32* [[TMP8]], align 1 +// BE32-PWR9-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// BE32-PWR9-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// BE32-PWR9-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16* +// BE32-PWR9-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1 +// 
BE32-PWR9-NEXT: store i16 [[TMP13]], i16* [[TMP12]], align 1 +// BE32-PWR9-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0 +// BE32-PWR9-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1 +// BE32-PWR9-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb15(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 15, data); +} +// BE-PWR8-LABEL: @test_ldrmb16( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// BE-PWR8-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +// BE-PWR8-NEXT: ret <16 x i8> [[TMP2]] +// +// LE-PWR8-LABEL: @test_ldrmb16( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// LE-PWR8-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +// LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> [[TMP2]], <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// LE-PWR8-NEXT: ret <16 x i8> [[TMP3]] +// +// BE-PWR9-LABEL: @test_ldrmb16( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 16, i64* [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** 
[[__A_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 +// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// BE-PWR9-NEXT: ret <16 x i8> [[TMP5]] +// +// LE-PWR9-LABEL: @test_ldrmb16( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 16, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56 +// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]] +// LE-PWR9-NEXT: 
store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP14]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: ret <16 x i8> [[TMP15]] +// +// BE32-PWR9-LABEL: @test_ldrmb16( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// BE32-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +// BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]] +// +vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); } + +// BE-PWR8-LABEL: @test_strmb16( +// BE-PWR8-NEXT: entry: +// BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// BE-PWR8-NEXT: 
store <16 x i8> [[TMP1]], <16 x i8>* [[TMP2]], align 1 +// BE-PWR8-NEXT: ret void +// +// LE-PWR8-LABEL: @test_strmb16( +// LE-PWR8-NEXT: entry: +// LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR8-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// LE-PWR8-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* [[TMP2]], align 1 +// LE-PWR8-NEXT: ret void +// +// BE-PWR9-LABEL: @test_strmb16( +// BE-PWR9-NEXT: entry: +// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: store i64 16, i64* [[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* 
[[__C_ADDR_I]], align 8 +// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56 +// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]] +// BE-PWR9-NEXT: ret void +// +// LE-PWR9-LABEL: @test_strmb16( +// LE-PWR9-NEXT: entry: +// LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8 +// LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8 +// LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// LE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8 +// LE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: store i64 16, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]] +// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8 +// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]] +// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]] +// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16 +// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// LE-PWR9-NEXT: 
[[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16 +// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]] +// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16 +// LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32> +// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8 +// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56 +// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]] +// LE-PWR9-NEXT: ret void +// +// BE32-PWR9-LABEL: @test_strmb16( +// BE32-PWR9-NEXT: entry: +// BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 4 +// BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16 +// BE32-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4 +// BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16 +// BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// BE32-PWR9-NEXT: store <16 x i8> [[TMP1]], <16 x i8>* [[TMP2]], align 1 +// BE32-PWR9-NEXT: ret void +// +void test_strmb16(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 16, data); +} diff --git a/clang/test/CodeGen/builtins-ppc-xlcompat-pwr8.c b/clang/test/CodeGen/builtins-ppc-xlcompat-pwr8.c --- a/clang/test/CodeGen/builtins-ppc-xlcompat-pwr8.c +++ b/clang/test/CodeGen/builtins-ppc-xlcompat-pwr8.c @@ -44,3 +44,13 @@ // CHECK-NOPWR8: error: this builtin is only valid on POWER8 or later CPUs return __builtin_ppc_stbcx(c_addr, c); } + +vector 
unsigned char test_ldrmb(char *ptr) { + // CHECK-NOPWR8: error: this builtin is only valid on POWER8 or later CPUs + return __builtin_vsx_ldrmb(ptr, 14); +} + +void test_strmbb(char *ptr, vector unsigned char data) { + // CHECK-NOPWR8: error: this builtin is only valid on POWER8 or later CPUs + __builtin_vsx_strmb(ptr, 14, data); +} diff --git a/clang/test/CodeGen/builtins-ppc-xlcompat-vec-error.c b/clang/test/CodeGen/builtins-ppc-xlcompat-vec-error.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/builtins-ppc-xlcompat-vec-error.c @@ -0,0 +1,26 @@ +// REQUIRES: powerpc-registered-target +// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -fsyntax-only \ +// RUN: -target-cpu pwr8 -Wall -Werror -verify %s +// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -target-cpu pwr8 -Wall -Werror -verify %s +// RUN: %clang_cc1 -triple powerpc64-unknown-aix -fsyntax-only \ +// RUN: -target-cpu pwr8 -Wall -Werror -verify %s +// RUN: %clang_cc1 -triple powerpc-unknown-aix -fsyntax-only \ +// RUN: -target-cpu pwr8 -Wall -Werror -verify %s + +#include <altivec.h> +vector unsigned char test_ldrmb(char *ptr) { + return __vec_ldrmb(ptr, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +void test_strmb(char *ptr, vector unsigned char data) { + __vec_strmb(ptr, 17, data); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +vector unsigned char test_ldrmbb(char *ptr) { + return __builtin_vsx_ldrmb(ptr, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +void test_strmbb(char *ptr, vector unsigned char data) { + __builtin_vsx_strmb(ptr, 17, data); // expected-error {{argument value 17 is outside the valid range [1, 16]}}
+}