Index: llvm/test/Analysis/CostModel/X86/powi.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/X86/powi.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-linux-gnu -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+define i32 @powi(i32 %arg) {
+; SSE-LABEL: 'powi'
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float undef, i32 %arg)
+; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> undef, i32 6)
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'powi'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float undef, i32 %arg)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> undef, i32 6)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'powi'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float undef, i32 %arg)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> undef, i32 6)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'powi'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float undef, i32 %arg)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> undef, i32 6)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %F32 = call float @llvm.powi.f32(float undef, i32 %arg)
+  %V2F32 = call <2 x float> @llvm.powi.v2f32(<2 x float> undef, i32 6)
+  %V4F32 = call <4 x float> @llvm.powi.v4f32(<4 x float> undef, i32 6)
+  %V8F32 = call <8 x float> @llvm.powi.v8f32(<8 x float> undef, i32 6)
+  %V16F32 = call <16 x float> @llvm.powi.v16f32(<16 x float> undef, i32 6)
+
+  %F64 = call double @llvm.powi.f64(double undef, i32 6)
+  %V2F64 = call <2 x double> @llvm.powi.v2f64(<2 x double> undef, i32 6)
+  %V4F64 = call <4 x double> @llvm.powi.v4f64(<4 x double> undef, i32 6)
+  %V8F64 = call <8 x double> @llvm.powi.v8f64(<8 x double> undef, i32 6)
+  %V16F64 = call <16 x double> @llvm.powi.v16f64(<16 x double> undef, i32 6)
+
+  ret i32 undef
+}
+
+declare float @llvm.powi.f32(float, i32)
+declare <2 x float> @llvm.powi.v2f32(<2 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32(<4 x float>, i32)
+declare <8 x float> @llvm.powi.v8f32(<8 x float>, i32)
+declare <16 x float> @llvm.powi.v16f32(<16 x float>, i32)
+
+declare double @llvm.powi.f64(double, i32)
+declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32)
+declare <4 x double> @llvm.powi.v4f64(<4 x double>, i32)
+declare <8 x double> @llvm.powi.v8f64(<8 x double>, i32)
+declare <16 x double> @llvm.powi.v16f64(<16 x double>, i32)
Index: llvm/test/Transforms/SLPVectorizer/X86/powi.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SLPVectorizer/X86/powi.ll
@@ -0,0 +1,491 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+define <2 x double> @buildvector_powi_2f64(<2 x double> %a) {
+; CHECK-LABEL: @buildvector_powi_2f64(
+; CHECK-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
+; CHECK-NEXT: [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 6)
+; CHECK-NEXT: [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 6)
+; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
+; CHECK-NEXT: ret <2 x double> [[R1]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %c0 = call double @llvm.powi.f64(double %a0 , i32 6)
+  %c1 = call double @llvm.powi.f64(double %a1 , i32 6)
+  %r0 = insertelement <2 x double> undef, double %c0, i32 0
+  %r1 = insertelement <2 x double> %r0, double %c1, i32 1
+  ret <2 x double> %r1
+}
+
+define <4 x float> @buildvector_powi_4f32(<4 x float> %a) {
+; SSE-LABEL: @buildvector_powi_4f32(
+; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 6)
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
+; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 6)
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32>
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32>
+; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32>
+; SSE-NEXT: ret <4 x float> [[R31]]
+;
+; AVX1-LABEL: @buildvector_powi_4f32(
+; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; AVX1-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; AVX1-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; AVX1-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 6)
+; AVX1-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 6)
+; AVX1-NEXT: [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 6)
+; AVX1-NEXT: [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 6)
+; AVX1-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[C0]], i32 0
+; AVX1-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
+; AVX1-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[C2]], i32 2
+; AVX1-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[C3]], i32 3
+; AVX1-NEXT: ret <4 x float> [[R3]]
+;
+; AVX2-LABEL: @buildvector_powi_4f32(
+; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX2-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; AVX2-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; AVX2-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; AVX2-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 6)
+; AVX2-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 6)
+; AVX2-NEXT: [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 6)
+; AVX2-NEXT: [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 6)
+; AVX2-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[C0]], i32 0
+; AVX2-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
+; AVX2-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[C2]], i32 2
+; AVX2-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[C3]], i32 3
+; AVX2-NEXT: ret <4 x float> [[R3]]
+;
+; AVX512-LABEL: @buildvector_powi_4f32(
+; AVX512-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX512-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; AVX512-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; AVX512-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; AVX512-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 6)
+; AVX512-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 6)
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
+; AVX512-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
+; AVX512-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 6)
+; AVX512-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[C0]], i32 0
+; AVX512-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
+; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32>
+; AVX512-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32>
+; AVX512-NEXT: ret <4 x float> [[R31]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %c0 = call float @llvm.powi.f32(float %a0 , i32 6)
+  %c1 = call float @llvm.powi.f32(float %a1 , i32 6)
+  %c2 = call float @llvm.powi.f32(float %a2 , i32 6)
+  %c3 = call float @llvm.powi.f32(float %a3 , i32 6)
+  %r0 = insertelement <4 x float> undef, float %c0, i32 0
+  %r1 = insertelement <4 x float> %r0, float %c1, i32 1
+  %r2 = insertelement <4 x float> %r1, float %c2, i32 2
+  %r3 = insertelement <4 x float> %r2, float %c3, i32 3
+  ret <4 x float> %r3
+}
+
+;
+; 256-bit Vectors
+;
+
+define <4 x double> @buildvector_powi_4f64(<4 x double> %a) {
+; SSE-LABEL: @buildvector_powi_4f64(
+; SSE-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 6)
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
+; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 6)
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32>
+; SSE-NEXT: ret <4 x double> [[R31]]
+;
+; AVX1-LABEL: @buildvector_powi_4f64(
+; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; AVX1-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; AVX1-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; AVX1-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; AVX1-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
+; AVX1-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 6)
+; AVX1-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
+; AVX1-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 6)
+; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
+; AVX1-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; AVX1-NEXT: [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32>
+; AVX1-NEXT: ret <4 x double> [[R31]]
+;
+; AVX2-LABEL: @buildvector_powi_4f64(
+; AVX2-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 6)
+; AVX2-NEXT: ret <4 x double> [[TMP1]]
+;
+; AVX512-LABEL: @buildvector_powi_4f64(
+; AVX512-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 6)
+; AVX512-NEXT: ret <4 x double> [[TMP1]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %c0 = call double @llvm.powi.f64(double %a0 , i32 6)
+  %c1 = call double @llvm.powi.f64(double %a1 , i32 6)
+  %c2 = call double @llvm.powi.f64(double %a2 , i32 6)
+  %c3 = call double @llvm.powi.f64(double %a3 , i32 6)
+  %r0 = insertelement <4 x double> undef, double %c0, i32 0
+  %r1 = insertelement <4 x double> %r0, double %c1, i32 1
+  %r2 = insertelement <4 x double> %r1, double %c2, i32 2
+  %r3 = insertelement <4 x double> %r2, double %c3, i32 3
+  ret <4 x double> %r3
+}
+
+define <8 x float> @buildvector_powi_8f32(<8 x float> %a) {
+; SSE-LABEL: @buildvector_powi_8f32(
+; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3]], i32 3
+; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 6)
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[A4]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[A5]], i32 1
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[A6]], i32 2
+; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A7]], i32 3
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP9]], i32 6)
+; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <8 x i32>
+; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32>
+; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP12]], <8 x i32>
+; SSE-NEXT: ret <8 x float> [[R71]]
+;
+; AVX1-LABEL: @buildvector_powi_8f32(
+; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; AVX1-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 6)
+; AVX1-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 6)
+; AVX1-NEXT: [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 6)
+; AVX1-NEXT: [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 6)
+; AVX1-NEXT: [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 6)
+; AVX1-NEXT: [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 6)
+; AVX1-NEXT: [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 6)
+; AVX1-NEXT: [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 6)
+; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[C0]], i32 0
+; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
+; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[C2]], i32 2
+; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[C3]], i32 3
+; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[C4]], i32 4
+; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
+; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[C6]], i32 6
+; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[C7]], i32 7
+; AVX1-NEXT: ret <8 x float> [[R7]]
+;
+; AVX2-LABEL: @buildvector_powi_8f32(
+; AVX2-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 6)
+; AVX2-NEXT: ret <8 x float> [[TMP1]]
+;
+; AVX512-LABEL: @buildvector_powi_8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 6)
+; AVX512-NEXT: ret <8 x float> [[TMP1]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %c0 = call float @llvm.powi.f32(float %a0 , i32 6)
+  %c1 = call float @llvm.powi.f32(float %a1 , i32 6)
+  %c2 = call float @llvm.powi.f32(float %a2 , i32 6)
+  %c3 = call float @llvm.powi.f32(float %a3 , i32 6)
+  %c4 = call float @llvm.powi.f32(float %a4 , i32 6)
+  %c5 = call float @llvm.powi.f32(float %a5 , i32 6)
+  %c6 = call float @llvm.powi.f32(float %a6 , i32 6)
+  %c7 = call float @llvm.powi.f32(float %a7 , i32 6)
+  %r0 = insertelement <8 x float> undef, float %c0, i32 0
+  %r1 = insertelement <8 x float> %r0, float %c1, i32 1
+  %r2 = insertelement <8 x float> %r1, float %c2, i32 2
+  %r3 = insertelement <8 x float> %r2, float %c3, i32 3
+  %r4 = insertelement <8 x float> %r3, float %c4, i32 4
+  %r5 = insertelement <8 x float> %r4, float %c5, i32 5
+  %r6 = insertelement <8 x float> %r5, float %c6, i32 6
+  %r7 = insertelement <8 x float> %r6, float %c7, i32 7
+  ret <8 x float> %r7
+}
+
+;
+; 512-bit Vectors
+;
+
+define <8 x double> @buildvector_powi_8f64(<8 x double> %a) {
+; SSE-LABEL: @buildvector_powi_8f64(
+; SSE-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
+; SSE-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 6)
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
+; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 6)
+; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32>
+; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32>
+; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32>
+; SSE-NEXT: ret <8 x double> [[R71]]
+;
+; AVX1-LABEL: @buildvector_powi_8f64(
+; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
+; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
+; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
+; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; AVX1-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
+; AVX1-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
+; AVX1-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
+; AVX1-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
+; AVX1-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 6)
+; AVX1-NEXT: [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
+; AVX1-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
+; AVX1-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
+; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
+; AVX1-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 6)
+; AVX1-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32>
+; AVX1-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32>
+; AVX1-NEXT: [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32>
+; AVX1-NEXT: ret <8 x double> [[R71]]
+;
+; AVX2-LABEL: @buildvector_powi_8f64(
+; AVX2-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 6)
+; AVX2-NEXT: ret <8 x double> [[TMP1]]
+;
+; AVX512-LABEL: @buildvector_powi_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 6)
+; AVX512-NEXT: ret <8 x double> [[TMP1]]
+;
+  %a0 = extractelement <8 x double> %a, i32 0
+  %a1 = extractelement <8 x double> %a, i32 1
+  %a2 = extractelement <8 x double> %a, i32 2
+  %a3 = extractelement <8 x double> %a, i32 3
+  %a4 = extractelement <8 x double> %a, i32 4
+  %a5 = extractelement <8 x double> %a, i32 5
+  %a6 = extractelement <8 x double> %a, i32 6
+  %a7 = extractelement <8 x double> %a, i32 7
+  %c0 = call double @llvm.powi.f64(double %a0 , i32 6)
+  %c1 = call double @llvm.powi.f64(double %a1 , i32 6)
+  %c2 = call double @llvm.powi.f64(double %a2 , i32 6)
+  %c3 = call double @llvm.powi.f64(double %a3 , i32 6)
+  %c4 = call double @llvm.powi.f64(double %a4 , i32 6)
+  %c5 = call double @llvm.powi.f64(double %a5 , i32 6)
+  %c6 = call double @llvm.powi.f64(double %a6 , i32 6)
+  %c7 = call double @llvm.powi.f64(double %a7 , i32 6)
+  %r0 = insertelement <8 x double> undef, double %c0, i32 0
+  %r1 = insertelement <8 x double> %r0, double %c1, i32 1
+  %r2 = insertelement <8 x double> %r1, double %c2, i32 2
+  %r3 = insertelement <8 x double> %r2, double %c3, i32 3
+  %r4 = insertelement <8 x double> %r3, double %c4, i32 4
+  %r5 = insertelement <8 x double> %r4, double %c5, i32 5
+  %r6 = insertelement <8 x double> %r5, double %c6, i32 6
+  %r7 = insertelement <8 x double> %r6, double %c7, i32 7
+  ret <8 x double> %r7
+}
+
+define <16 x float> @buildvector_powi_16f32(<16 x float> %a) {
+; SSE-LABEL: @buildvector_powi_16f32(
+; SSE-NEXT: [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
+; SSE-NEXT: [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
+; SSE-NEXT: [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
+; SSE-NEXT: [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
+; SSE-NEXT: [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
+; SSE-NEXT: [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
+; SSE-NEXT: [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
+; SSE-NEXT: [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
+; SSE-NEXT: [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[A0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[A1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[A2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[A3]], i32 3
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[A4]], i32 4
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[A5]], i32 5
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[A6]], i32 6
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[A7]], i32 7
+; SSE-NEXT: [[TMP9:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP8]], i32 6)
+; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[A8]], i32 0
+; SSE-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[A9]], i32 1
+; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[A10]], i32 2
+; SSE-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[A11]], i32 3
+; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[A12]], i32 4
+; SSE-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[A13]], i32 5
+; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[A14]], i32 6
+; SSE-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[A15]], i32 7
+; SSE-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP17]], i32 6)
+; SSE-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <16 x i32>
+; SSE-NEXT: [[TMP20:%.*]] = shufflevector <8 x float> [[TMP18]], <8 x float> poison, <16 x i32>
+; SSE-NEXT: [[R151:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32>
+; SSE-NEXT: ret <16 x float> [[R151]]
+;
+; AVX1-LABEL: @buildvector_powi_16f32(
+; AVX1-NEXT: [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
+; AVX1-NEXT: [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
+; AVX1-NEXT: [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
+; AVX1-NEXT: [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
+; AVX1-NEXT: [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
+; AVX1-NEXT: [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
+; AVX1-NEXT: [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
+; AVX1-NEXT: [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
+; AVX1-NEXT: [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
+; AVX1-NEXT: [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
+; AVX1-NEXT: [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
+; AVX1-NEXT: [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
+; AVX1-NEXT: [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
+; AVX1-NEXT: [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
+; AVX1-NEXT: [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
+; AVX1-NEXT: [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
+; AVX1-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 6)
+; AVX1-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 6)
+; AVX1-NEXT: [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 6)
+; AVX1-NEXT: [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 6)
+; AVX1-NEXT: [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 6)
+; AVX1-NEXT: [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 6)
+; AVX1-NEXT: [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 6)
+; AVX1-NEXT: [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 6)
+; AVX1-NEXT: [[C8:%.*]] = call float @llvm.powi.f32.i32(float [[A8]], i32 6)
+; AVX1-NEXT: [[C9:%.*]] = call float @llvm.powi.f32.i32(float [[A9]], i32 6)
+; AVX1-NEXT: [[C10:%.*]] = call float @llvm.powi.f32.i32(float [[A10]], i32 6)
+; AVX1-NEXT: [[C11:%.*]] = call float @llvm.powi.f32.i32(float [[A11]], i32 6)
+; AVX1-NEXT: [[C12:%.*]] = call float @llvm.powi.f32.i32(float [[A12]], i32 6)
+; AVX1-NEXT: [[C13:%.*]] = call float @llvm.powi.f32.i32(float [[A13]], i32 6)
+; AVX1-NEXT: [[C14:%.*]] = call float @llvm.powi.f32.i32(float [[A14]], i32 6)
+; AVX1-NEXT: [[C15:%.*]] = call float @llvm.powi.f32.i32(float [[A15]], i32 6)
+; AVX1-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[C0]], i32 0
+; AVX1-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
+; AVX1-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[C2]], i32 2
+; AVX1-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[C3]], i32 3
+; AVX1-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[C4]], i32 4
+; AVX1-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
+; AVX1-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[C6]], i32 6
+; AVX1-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[C7]], i32 7
+; AVX1-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[C8]], i32 8
+; AVX1-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
+; AVX1-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[C10]], i32 10
+; AVX1-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[C11]], i32 11
+; AVX1-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[C12]], i32 12
+; AVX1-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
+; AVX1-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[C14]], i32 14
+; AVX1-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[C15]], i32 15
+; AVX1-NEXT: ret <16 x float> [[R15]]
+;
+; AVX2-LABEL: @buildvector_powi_16f32(
+; AVX2-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 6)
+; AVX2-NEXT: ret <16 x float> [[TMP1]]
+;
+; AVX512-LABEL: @buildvector_powi_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 6)
+; AVX512-NEXT: ret <16 x float> [[TMP1]]
+;
+  %a0 = extractelement <16 x float> %a, i32 0
+  %a1 = extractelement <16 x float> %a, i32 1
+  %a2 = extractelement <16 x float> %a, i32 2
+  %a3 = extractelement <16 x float> %a, i32 3
+  %a4 = extractelement <16 x float> %a, i32 4
+  %a5 = extractelement <16 x float> %a, i32 5
+  %a6 = extractelement <16 x float> %a, i32 6
+  %a7 = extractelement <16 x float> %a, i32 7
+  %a8 = extractelement <16 x float> %a, i32 8
+  %a9 = extractelement <16 x float> %a, i32 9
+  %a10 = extractelement <16 x float> %a, i32 10
+  %a11 = extractelement <16 x float> %a, i32 11
+  %a12 = extractelement <16 x float> %a, i32 12
+  %a13 = extractelement <16 x float> %a, i32 13
+  %a14 = extractelement <16 x float> %a, i32 14
+  %a15 = extractelement <16 x float> %a, i32 15
+  %c0 = call float @llvm.powi.f32(float %a0 , i32 6)
+  %c1 = call float @llvm.powi.f32(float %a1 , i32 6)
+  %c2 = call float @llvm.powi.f32(float %a2 , i32 6)
+  %c3 = call float @llvm.powi.f32(float %a3 , i32 6)
+  %c4 = call float @llvm.powi.f32(float %a4 , i32 6)
+  %c5 = call float @llvm.powi.f32(float %a5 , i32 6)
+  %c6 = call float @llvm.powi.f32(float %a6 , i32 6)
+  %c7 = call float @llvm.powi.f32(float %a7 , i32 6)
+  %c8 = call float @llvm.powi.f32(float %a8 , i32 6)
+  %c9 = call float @llvm.powi.f32(float %a9 , i32 6)
+  %c10 = call float @llvm.powi.f32(float %a10 , i32 6)
+  %c11 = call float @llvm.powi.f32(float %a11 , i32 6)
+  %c12 = call float @llvm.powi.f32(float %a12 , i32 6)
+  %c13 = call float @llvm.powi.f32(float %a13 , i32 6)
+  %c14 = call float @llvm.powi.f32(float %a14 , i32 6)
+  %c15 = call float @llvm.powi.f32(float %a15 , i32 6)
+  %r0 = insertelement <16 x float> undef, float %c0 , i32 0
+  %r1 = insertelement <16 x float> %r0 , float %c1 , i32 1
+  %r2 = insertelement <16 x float> %r1 , float %c2 , i32 2
+  %r3 = insertelement <16 x float> %r2 , float %c3 , i32 3
+  %r4 = insertelement <16 x float> %r3 , float %c4 , i32 4
+  %r5 = insertelement <16 x float> %r4 , float %c5 , i32 5
+  %r6 = insertelement <16 x float> %r5 , float %c6 , i32 6
+  %r7 = insertelement <16 x float> %r6 , float %c7 , i32 7
+  %r8 = insertelement <16 x float> %r7 , float %c8 , i32 8
+  %r9 = insertelement <16 x float> %r8 , float %c9 , i32 9
+  %r10 = insertelement <16 x float> %r9 , float %c10, i32 10
+  %r11 = insertelement <16 x float> %r10, float %c11, i32 11
+  %r12 = insertelement <16 x float> %r11, float %c12, i32 12
+  %r13 = insertelement <16 x float> %r12, float %c13, i32 13
+  %r14 = insertelement <16 x float> %r13, float %c14, i32 14
+  %r15 = insertelement <16 x float> %r14, float %c15, i32 15
+  ret <16 x float> %r15
+}
+
+declare float @llvm.powi.f32(float, i32)
+declare double @llvm.powi.f64(double, i32)