diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll @@ -0,0 +1,1814 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple armv8---eabi -mattr=+aes,+fix-cortex-a57-aes-1742098 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK-FIX,CHECK-FIX-NOSCHED + +; These CPUs should have the fix enabled by default. They use different +; FileCheck prefixes because some instructions are scheduled differently. +; +; RUN: llc -mtriple armv8---eabi -mcpu=cortex-a57 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK-FIX,CHECK-CORTEX-FIX +; RUN: llc -mtriple armv8---eabi -mcpu=cortex-a72 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK-FIX,CHECK-CORTEX-FIX + +; This checks that adding `+fix-cortex-a57-aes-1742098` causes a `vorr` to be +; inserted wherever the compiler cannot prove that each input to the first AES +; instruction of a fused AES pair was set by a 64-bit or 128-bit Neon register +; write. Any other register write is unsafe and requires a `vorr` to protect +; the AES input. + +declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>) +declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>) + + +define arm_aapcs_vfpcc void @aese_zero(<16 x i8>* %0) nounwind { +; CHECK-FIX-LABEL: aese_zero: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vmov.i32 q9, #0x0 +; CHECK-FIX-NEXT: aese.8 q9, q8 +; CHECK-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: bx lr + %2 = load <16 x i8>, <16 x i8>* %0, align 8 + %3 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> zeroinitializer, <16 x i8> %2) + %4 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %3) + store <16 x i8> %4, <16 x i8>* %0, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_once_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aese_once_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aese.8 q9, q8 +; CHECK-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + ret void +} + +define arm_aapcs_vfpcc <16 x i8> @aese_once_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-LABEL: aese_once_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: aese.8 q1, q0 +; CHECK-FIX-NEXT: aesmc.8 q0, q1 +; CHECK-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %3) + ret <16 x i8> %4 +} + +define arm_aapcs_vfpcc void @aese_twice_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aese_twice_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aese.8 q9, q8 +; CHECK-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: 
aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + %7 = load <16 x i8>, <16 x i8>* %0, align 8 + %8 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %6, <16 x i8> %7) + %9 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %1, align 8 + ret void +} + +define arm_aapcs_vfpcc <16 x i8> @aese_twice_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-LABEL: aese_twice_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: aese.8 q1, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q0, q8 +; CHECK-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %3) + %5 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %0) + %6 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %5) + ret <16 x i8> %6 +} + +define arm_aapcs_vfpcc void @aese_loop_via_ptr(i32 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB5_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB5_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB5_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bne .LBB5_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %6 + +5: + ret void + +6: + %7 = phi i32 [ %12, %6 ], [ 0, %3 ] + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = load <16 x i8>, <16 x i8>* %1, align 8 + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %8, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + %12 = add nuw i32 %7, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %6 +} + +define arm_aapcs_vfpcc <16 x i8> @aese_loop_via_val(i32 %0, <16 x i8> %1, <16 x i8> %2) nounwind { +; CHECK-FIX-LABEL: aese_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB6_2 +; CHECK-FIX-NEXT: .LBB6_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q1, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q1, q1 +; CHECK-FIX-NEXT: bne .LBB6_1 +; CHECK-FIX-NEXT: .LBB6_2: +; CHECK-FIX-NEXT: vorr q0, q1, q1 +; CHECK-FIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %7 + +5: + %6 = phi <16 x i8> 
[ %2, %3 ], [ %11, %7 ] + ret <16 x i8> %6 + +7: + %8 = phi i32 [ %12, %7 ], [ 0, %3 ] + %9 = phi <16 x i8> [ %11, %7 ], [ %2, %3 ] + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %1) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + %12 = add nuw i32 %8, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %7 +} + +define arm_aapcs_vfpcc void @aese_set8_via_ptr(i8* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set8_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.8 {d0[0]}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i8, i8* %0, align 1 + %6 = insertelement <16 x i8> %1, i8 %5, i64 0 + %7 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %6) + %8 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %7) + store <16 x i8> %8, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set8_via_val(i8 zeroext %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set8_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.8 d16[0], r0 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = insertelement <16 x i8> %4, i8 %0, i64 0 + %6 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %1) + %7 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %6) + store <16 x i8> %7, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set8_cond_via_ptr(i1 zeroext %0, i8* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set8_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB9_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.8 {d0[0]}, [r1] +; CHECK-FIX-NEXT: .LBB9_2: +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %9 + +6: + %7 = load i8, i8* %1, align 1 + %8 = insertelement <16 x i8> %2, i8 %7, i64 0 + br label %9 + +9: + %10 = phi <16 x i8> [ %8, %6 ], [ %2, %4 ] + %11 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %10) + %12 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %11) + store <16 x i8> %12, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set8_cond_via_val(i1 zeroext %0, i8 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set8_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB10_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.8 d16[0], r1 +; CHECK-FIX-NEXT: .LBB10_2: @ %select.end +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + %6 = insertelement <16 x i8> %5, i8 %1, i64 0 + %7 = select i1 %0, <16 x i8> %6, <16 x i8> %5 + %8 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %2) + %9 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void 
@aese_set8_loop_via_ptr(i32 %0, i8* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set8_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB11_1: +; CHECK-FIX-NEXT: vld1.8 {d0[0]}, [r1] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB11_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB11_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i8, i8* %1, align 1 + %6 = insertelement <16 x i8> %2, i8 %5, i64 0 + %7 = icmp eq i32 %0, 0 + br i1 %7, label %11, label %8 + +8: + %9 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %12 + +10: + store <16 x i8> %16, <16 x i8>* %3, align 8 + br label %11 + +11: + ret void + +12: + %13 = phi <16 x i8> [ %9, %8 ], [ %16, %12 ] + %14 = phi i32 [ 0, %8 ], [ %17, %12 ] + %15 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %13, <16 x i8> %6) + %16 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %15) + %17 = add nuw i32 %14, 1 + %18 = icmp eq i32 %17, %0 + br i1 %18, label %10, label %12 +} + +define arm_aapcs_vfpcc void @aese_set8_loop_via_val(i32 %0, i8 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set8_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB12_1: +; CHECK-FIX-NEXT: vmov.8 d0[0], r1 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB12_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB12_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %10, label %6 + +6: + %7 = insertelement <16 x i8> %2, i8 %1, i64 0 + %8 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %11 + +9: + store <16 x i8> %15, <16 x i8>* %3, align 8 + br label %10 + +10: + ret void + +11: + %12 = phi <16 x i8> [ %8, %6 ], [ %15, %11 ] + %13 = phi i32 [ 0, %6 ], [ %16, %11 ] + %14 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %12, <16 x i8> %7) + %15 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %14) + %16 = add nuw i32 %13, 1 + %17 = icmp eq i32 %16, %0 + br i1 %17, label %9, label %11 +} + +define arm_aapcs_vfpcc void @aese_set16_via_ptr(i16* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set16_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.16 {d0[0]}, [r0:16] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i16, i16* %0, align 2 + %6 = bitcast <16 x i8> %1 to <8 x i16> + %7 = insertelement <8 x i16> %6, i16 %5, i64 0 + %8 = bitcast <8 x i16> %7 to <16 x i8> + %9 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set16_via_val(i16 zeroext %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set16_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.16 d16[0], r0 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; 
CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <8 x i16>* + %5 = load <8 x i16>, <8 x i16>* %4, align 8 + %6 = insertelement <8 x i16> %5, i16 %0, i64 0 + %7 = bitcast <8 x i16> %6 to <16 x i8> + %8 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %1) + %9 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set16_cond_via_ptr(i1 zeroext %0, i16* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set16_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB15_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NEXT: .LBB15_2: +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i16, i16* %1, align 2 + %8 = bitcast <16 x i8> %2 to <8 x i16> + %9 = insertelement <8 x i16> %8, i16 %7, i64 0 + br label %12 + +10: + %11 = bitcast <16 x i8> %2 to <8 x i16> + br label %12 + +12: + %13 = phi <8 x i16> [ %9, %6 ], [ %11, %10 ] + %14 = bitcast <8 x i16> %13 to <16 x i8> + %15 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %14) + %16 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %15) + store <16 x i8> %16, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set16_cond_via_val(i1 zeroext %0, i16 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set16_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB16_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.16 d16[0], r1 +; CHECK-FIX-NEXT: .LBB16_2: @ %select.end +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <8 x i16>* + %6 = load <8 x i16>, <8 x i16>* %5, align 8 + %7 = insertelement <8 x i16> %6, i16 %1, i64 0 + %8 = select i1 %0, <8 x i16> %7, <8 x i16> %6 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %2) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set16_loop_via_ptr(i32 %0, i16* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set16_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB17_1: +; CHECK-FIX-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB17_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB17_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i16, i16* %1, align 2 + %6 = bitcast <16 x i8> %2 to <8 x i16> + %7 = insertelement <8 x i16> %6, i16 %5, i64 0 + %8 = bitcast <8 x i16> %7 to <16 x i8> + %9 = icmp eq i32 %0, 0 + br i1 %9, label %13, label %10 + +10: + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %14 + +12: + store <16 x i8> %18, <16 x i8>* %3, align 8 + br label %13 + +13: + ret void + +14: + %15 = phi <16 x 
i8> [ %11, %10 ], [ %18, %14 ] + %16 = phi i32 [ 0, %10 ], [ %19, %14 ] + %17 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %15, <16 x i8> %8) + %18 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %17) + %19 = add nuw i32 %16, 1 + %20 = icmp eq i32 %19, %0 + br i1 %20, label %12, label %14 +} + +define arm_aapcs_vfpcc void @aese_set16_loop_via_val(i32 %0, i16 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set16_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB18_1: +; CHECK-FIX-NEXT: vmov.16 d0[0], r1 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB18_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB18_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %12, label %6 + +6: + %7 = bitcast <16 x i8> %2 to <8 x i16> + %8 = insertelement <8 x i16> %7, i16 %1, i64 0 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %6 ], [ %17, %13 ] + %15 = phi i32 [ 0, %6 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %14, <16 x i8> %9) + %17 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +} + +define arm_aapcs_vfpcc void @aese_set32_via_ptr(i32* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set32_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.32 {d0[0]}, [r0:32] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i32, i32* %0, align 4 + %6 = bitcast <16 x i8> %1 to <4 x i32> + %7 = insertelement <4 x i32> %6, i32 %5, i64 0 + %8 = bitcast <4 x i32> %7 to <16 x i8> + %9 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set32_via_val(i32 %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set32_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.32 d16[0], r0 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <4 x i32>* + %5 = load <4 x i32>, <4 x i32>* %4, align 8 + %6 = insertelement <4 x i32> %5, i32 %0, i64 0 + %7 = bitcast <4 x i32> %6 to <16 x i8> + %8 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %1) + %9 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set32_cond_via_ptr(i1 zeroext %0, i32* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set32_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB21_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.32 {d0[0]}, [r1:32] +; CHECK-FIX-NEXT: .LBB21_2: +; CHECK-FIX-NEXT: 
aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i32, i32* %1, align 4 + %8 = bitcast <16 x i8> %2 to <4 x i32> + %9 = insertelement <4 x i32> %8, i32 %7, i64 0 + br label %12 + +10: + %11 = bitcast <16 x i8> %2 to <4 x i32> + br label %12 + +12: + %13 = phi <4 x i32> [ %9, %6 ], [ %11, %10 ] + %14 = bitcast <4 x i32> %13 to <16 x i8> + %15 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %14) + %16 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %15) + store <16 x i8> %16, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set32_cond_via_val(i1 zeroext %0, i32 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set32_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB22_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.32 d16[0], r1 +; CHECK-FIX-NEXT: .LBB22_2: @ %select.end +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <4 x i32>* + %6 = load <4 x i32>, <4 x i32>* %5, align 8 + %7 = insertelement <4 x i32> %6, i32 %1, i64 0 + %8 = select i1 %0, <4 x i32> %7, <4 x i32> %6 + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %2) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set32_loop_via_ptr(i32 %0, i32* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set32_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB23_1: +; CHECK-FIX-NEXT: vld1.32 {d0[0]}, [r1:32] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB23_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB23_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i32, i32* %1, align 4 + %6 = bitcast <16 x i8> %2 to <4 x i32> + %7 = insertelement <4 x i32> %6, i32 %5, i64 0 + %8 = bitcast <4 x i32> %7 to <16 x i8> + %9 = icmp eq i32 %0, 0 + br i1 %9, label %13, label %10 + +10: + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %14 + +12: + store <16 x i8> %18, <16 x i8>* %3, align 8 + br label %13 + +13: + ret void + +14: + %15 = phi <16 x i8> [ %11, %10 ], [ %18, %14 ] + %16 = phi i32 [ 0, %10 ], [ %19, %14 ] + %17 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %15, <16 x i8> %8) + %18 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %17) + %19 = add nuw i32 %16, 1 + %20 = icmp eq i32 %19, %0 + br i1 %20, label %12, label %14 +} + +define arm_aapcs_vfpcc void @aese_set32_loop_via_val(i32 %0, i32 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set32_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB24_1: +; CHECK-FIX-NEXT: vmov.32 d0[0], r1 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB24_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB24_2 +; 
CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %12, label %6 + +6: + %7 = bitcast <16 x i8> %2 to <4 x i32> + %8 = insertelement <4 x i32> %7, i32 %1, i64 0 + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %6 ], [ %17, %13 ] + %15 = phi i32 [ 0, %6 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %14, <16 x i8> %9) + %17 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +} + +define arm_aapcs_vfpcc void @aese_set64_via_ptr(i64* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set64_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vldr d0, [r0] +; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q0 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set64_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vldr d0, [r0] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: aese.8 q8, q0 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i64, i64* %0, align 8 + %6 = bitcast <16 x i8> %1 to <2 x i64> + %7 = insertelement <2 x i64> %6, i64 %5, i64 0 + %8 = bitcast <2 x i64> %7 to <16 x i8> + %9 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set64_via_val(i64 %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set64_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vmov.32 d16[0], r0 +; CHECK-FIX-NEXT: vmov.32 d16[1], r1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <2 x i64>* + %5 = load <2 x i64>, <2 x i64>* %4, align 8 + %6 = insertelement <2 x i64> %5, i64 %0, i64 0 + %7 = bitcast <2 x i64> %6 to <16 x i8> + %8 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %1) + %9 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set64_cond_via_ptr(i1 zeroext %0, i64* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set64_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vldrne d0, [r1] +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i64, i64* %1, align 8 + %8 = bitcast <16 x i8> %2 to <2 x i64> + %9 = insertelement <2 x i64> %8, i64 %7, i64 0 + br label %12 + +10: + %11 = bitcast <16 x i8> %2 to <2 x i64> + br label %12 + +12: + %13 = phi <2 x i64> [ %9, %6 ], [ %11, %10 ] + %14 = bitcast <2 x i64> %13 to <16 x i8> + %15 = call <16 x i8> 
@llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %14) + %16 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %15) + store <16 x i8> %16, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set64_cond_via_val(i1 zeroext %0, i64 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set64_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: ldr r1, [sp] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: beq .LBB28_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.32 d16[0], r2 +; CHECK-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-FIX-NEXT: .LBB28_2: @ %select.end +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <2 x i64>* + %6 = load <2 x i64>, <2 x i64>* %5, align 8 + %7 = insertelement <2 x i64> %6, i64 %1, i64 0 + %8 = select i1 %0, <2 x i64> %7, <2 x i64> %6 + %9 = bitcast <2 x i64> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %2) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aese_set64_loop_via_ptr(i32 %0, i64* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set64_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB29_1: +; CHECK-FIX-NEXT: vldr d0, [r1] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB29_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB29_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i64, i64* %1, align 8 + %6 = bitcast <16 x i8> %2 to <2 x i64> + %7 = insertelement <2 x i64> %6, i64 %5, i64 0 + %8 = bitcast <2 x i64> %7 to <16 x i8> + %9 = icmp eq i32 %0, 0 + br i1 %9, label %13, label %10 + +10: + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %14 + +12: + store <16 x i8> %18, <16 x i8>* %3, align 8 + br label %13 + +13: + ret void + +14: + %15 = phi <16 x i8> [ %11, %10 ], [ %18, %14 ] + %16 = phi i32 [ 0, %10 ], [ %19, %14 ] + %17 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %15, <16 x i8> %8) + %18 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %17) + %19 = add nuw i32 %16, 1 + %20 = icmp eq i32 %19, %0 + br i1 %20, label %12, label %14 +} + +define arm_aapcs_vfpcc void @aese_set64_loop_via_val(i32 %0, i64 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set64_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB30_1: +; CHECK-FIX-NEXT: vmov.32 d0[0], r2 +; CHECK-FIX-NEXT: ldr r1, [sp] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.32 d0[1], r3 +; CHECK-FIX-NEXT: .LBB30_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aese.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB30_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %12, label %6 + +6: + %7 = bitcast <16 x i8> %2 to <2 x i64> + %8 = insertelement <2 x i64> %7, i64 %1, i64 0 + %9 = bitcast <2 x i64> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + 
br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %6 ], [ %17, %13 ] + %15 = phi i32 [ 0, %6 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %14, <16 x i8> %9) + %17 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +} + +define arm_aapcs_vfpcc void @aesd_zero(<16 x i8>* %0) nounwind { +; CHECK-FIX-LABEL: aesd_zero: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vmov.i32 q9, #0x0 +; CHECK-FIX-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: bx lr + %2 = load <16 x i8>, <16 x i8>* %0, align 8 + %3 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> zeroinitializer, <16 x i8> %2) + %4 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %3) + store <16 x i8> %4, <16 x i8>* %0, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_once_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aesd_once_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + ret void +} + +define arm_aapcs_vfpcc <16 x i8> @aesd_once_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-LABEL: aesd_once_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: aesd.8 q1, q0 +; CHECK-FIX-NEXT: aesimc.8 q0, q1 +; CHECK-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %3) + ret <16 x i8> %4 +} + +define arm_aapcs_vfpcc void @aesd_twice_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aesd_twice_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-FIX-NEXT: aesd.8 q8, q9 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + %7 = load <16 x i8>, <16 x i8>* %0, align 8 + %8 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %6, <16 x i8> %7) + %9 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %1, align 8 + ret void +} + +define arm_aapcs_vfpcc <16 x i8> @aesd_twice_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-LABEL: aesd_twice_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: aesd.8 q1, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q0, q8 +; CHECK-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %3) + %5 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %0) + %6 = call 
<16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %5) + ret <16 x i8> %6 +} + +define arm_aapcs_vfpcc void @aesd_loop_via_ptr(i32 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB36_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB36_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bne .LBB36_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %6 + +5: + ret void + +6: + %7 = phi i32 [ %12, %6 ], [ 0, %3 ] + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = load <16 x i8>, <16 x i8>* %1, align 8 + %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %8, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + %12 = add nuw i32 %7, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %6 +} + +define arm_aapcs_vfpcc <16 x i8> @aesd_loop_via_val(i32 %0, <16 x i8> %1, <16 x i8> %2) nounwind { +; CHECK-FIX-LABEL: aesd_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB37_2 +; CHECK-FIX-NEXT: .LBB37_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q1, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q1, q1 +; CHECK-FIX-NEXT: bne .LBB37_1 +; CHECK-FIX-NEXT: .LBB37_2: +; CHECK-FIX-NEXT: vorr q0, q1, q1 +; CHECK-FIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %7 + +5: + %6 = phi <16 x i8> [ %2, %3 ], [ %11, %7 ] + ret <16 x i8> %6 + +7: + %8 = phi i32 [ %12, %7 ], [ 0, %3 ] + %9 = phi <16 x i8> [ %11, %7 ], [ %2, %3 ] + %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %1) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + %12 = add nuw i32 %8, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %7 +} + +define arm_aapcs_vfpcc void @aesd_set8_via_ptr(i8* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set8_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.8 {d0[0]}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i8, i8* %0, align 1 + %6 = insertelement <16 x i8> %1, i8 %5, i64 0 + %7 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %6) + %8 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %7) + store <16 x i8> %8, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void 
@aesd_set8_via_val(i8 zeroext %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set8_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.8 d16[0], r0 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = insertelement <16 x i8> %4, i8 %0, i64 0 + %6 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %1) + %7 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %6) + store <16 x i8> %7, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set8_cond_via_ptr(i1 zeroext %0, i8* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set8_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB40_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.8 {d0[0]}, [r1] +; CHECK-FIX-NEXT: .LBB40_2: +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %9 + +6: + %7 = load i8, i8* %1, align 1 + %8 = insertelement <16 x i8> %2, i8 %7, i64 0 + br label %9 + +9: + %10 = phi <16 x i8> [ %8, %6 ], [ %2, %4 ] + %11 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %10) + %12 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %11) + store <16 x i8> %12, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set8_cond_via_val(i1 zeroext %0, i8 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set8_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB41_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.8 d16[0], r1 +; CHECK-FIX-NEXT: .LBB41_2: @ %select.end +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + %6 = insertelement <16 x i8> %5, i8 %1, i64 0 + %7 = select i1 %0, <16 x i8> %6, <16 x i8> %5 + %8 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %2) + %9 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set8_loop_via_ptr(i32 %0, i8* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set8_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB42_1: +; CHECK-FIX-NEXT: vld1.8 {d0[0]}, [r1] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB42_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB42_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i8, i8* %1, align 1 + %6 = insertelement <16 x i8> %2, i8 %5, i64 0 + %7 = icmp eq i32 %0, 0 + br i1 %7, label %11, label %8 + +8: + %9 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %12 + +10: + store <16 x i8> %16, <16 x i8>* %3, align 8 + br label %11 + +11: + ret void + +12: + %13 = phi <16 x i8> [ %9, %8 ], [ %16, %12 ] + %14 = phi i32 [ 0, %8 ], [ %17, %12 ] + %15 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %13, <16 x i8> %6) + 
%16 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %15) + %17 = add nuw i32 %14, 1 + %18 = icmp eq i32 %17, %0 + br i1 %18, label %10, label %12 +} + +define arm_aapcs_vfpcc void @aesd_set8_loop_via_val(i32 %0, i8 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set8_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB43_1: +; CHECK-FIX-NEXT: vmov.8 d0[0], r1 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB43_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB43_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %10, label %6 + +6: + %7 = insertelement <16 x i8> %2, i8 %1, i64 0 + %8 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %11 + +9: + store <16 x i8> %15, <16 x i8>* %3, align 8 + br label %10 + +10: + ret void + +11: + %12 = phi <16 x i8> [ %8, %6 ], [ %15, %11 ] + %13 = phi i32 [ 0, %6 ], [ %16, %11 ] + %14 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %12, <16 x i8> %7) + %15 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %14) + %16 = add nuw i32 %13, 1 + %17 = icmp eq i32 %16, %0 + br i1 %17, label %9, label %11 +} + +define arm_aapcs_vfpcc void @aesd_set16_via_ptr(i16* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set16_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.16 {d0[0]}, [r0:16] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i16, i16* %0, align 2 + %6 = bitcast <16 x i8> %1 to <8 x i16> + %7 = insertelement <8 x i16> %6, i16 %5, i64 0 + %8 = bitcast <8 x i16> %7 to <16 x i8> + %9 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set16_via_val(i16 zeroext %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set16_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.16 d16[0], r0 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <8 x i16>* + %5 = load <8 x i16>, <8 x i16>* %4, align 8 + %6 = insertelement <8 x i16> %5, i16 %0, i64 0 + %7 = bitcast <8 x i16> %6 to <16 x i8> + %8 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %1) + %9 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set16_cond_via_ptr(i1 zeroext %0, i16* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set16_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB46_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NEXT: .LBB46_2: +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i16, i16* 
%1, align 2 + %8 = bitcast <16 x i8> %2 to <8 x i16> + %9 = insertelement <8 x i16> %8, i16 %7, i64 0 + br label %12 + +10: + %11 = bitcast <16 x i8> %2 to <8 x i16> + br label %12 + +12: + %13 = phi <8 x i16> [ %9, %6 ], [ %11, %10 ] + %14 = bitcast <8 x i16> %13 to <16 x i8> + %15 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %14) + %16 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %15) + store <16 x i8> %16, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set16_cond_via_val(i1 zeroext %0, i16 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set16_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB47_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.16 d16[0], r1 +; CHECK-FIX-NEXT: .LBB47_2: @ %select.end +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <8 x i16>* + %6 = load <8 x i16>, <8 x i16>* %5, align 8 + %7 = insertelement <8 x i16> %6, i16 %1, i64 0 + %8 = select i1 %0, <8 x i16> %7, <8 x i16> %6 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %2) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set16_loop_via_ptr(i32 %0, i16* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set16_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB48_1: +; CHECK-FIX-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB48_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB48_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i16, i16* %1, align 2 + %6 = bitcast <16 x i8> %2 to <8 x i16> + %7 = insertelement <8 x i16> %6, i16 %5, i64 0 + %8 = bitcast <8 x i16> %7 to <16 x i8> + %9 = icmp eq i32 %0, 0 + br i1 %9, label %13, label %10 + +10: + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %14 + +12: + store <16 x i8> %18, <16 x i8>* %3, align 8 + br label %13 + +13: + ret void + +14: + %15 = phi <16 x i8> [ %11, %10 ], [ %18, %14 ] + %16 = phi i32 [ 0, %10 ], [ %19, %14 ] + %17 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %15, <16 x i8> %8) + %18 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %17) + %19 = add nuw i32 %16, 1 + %20 = icmp eq i32 %19, %0 + br i1 %20, label %12, label %14 +} + +define arm_aapcs_vfpcc void @aesd_set16_loop_via_val(i32 %0, i16 zeroext %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set16_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB49_1: +; CHECK-FIX-NEXT: vmov.16 d0[0], r1 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB49_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB49_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %12, label %6 + +6: + %7 = bitcast <16 x i8> %2 to <8 x i16> + %8 = 
insertelement <8 x i16> %7, i16 %1, i64 0 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %6 ], [ %17, %13 ] + %15 = phi i32 [ 0, %6 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %14, <16 x i8> %9) + %17 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +} + +define arm_aapcs_vfpcc void @aesd_set32_via_ptr(i32* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set32_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.32 {d0[0]}, [r0:32] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i32, i32* %0, align 4 + %6 = bitcast <16 x i8> %1 to <4 x i32> + %7 = insertelement <4 x i32> %6, i32 %5, i64 0 + %8 = bitcast <4 x i32> %7 to <16 x i8> + %9 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set32_via_val(i32 %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set32_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.32 d16[0], r0 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <4 x i32>* + %5 = load <4 x i32>, <4 x i32>* %4, align 8 + %6 = insertelement <4 x i32> %5, i32 %0, i64 0 + %7 = bitcast <4 x i32> %6 to <16 x i8> + %8 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %1) + %9 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set32_cond_via_ptr(i1 zeroext %0, i32* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set32_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB52_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.32 {d0[0]}, [r1:32] +; CHECK-FIX-NEXT: .LBB52_2: +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i32, i32* %1, align 4 + %8 = bitcast <16 x i8> %2 to <4 x i32> + %9 = insertelement <4 x i32> %8, i32 %7, i64 0 + br label %12 + +10: + %11 = bitcast <16 x i8> %2 to <4 x i32> + br label %12 + +12: + %13 = phi <4 x i32> [ %9, %6 ], [ %11, %10 ] + %14 = bitcast <4 x i32> %13 to <16 x i8> + %15 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %14) + %16 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %15) + store <16 x i8> %16, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set32_cond_via_val(i1 zeroext %0, i32 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set32_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB53_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.32 d16[0], r1 +; 
CHECK-FIX-NEXT: .LBB53_2: @ %select.end +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <4 x i32>* + %6 = load <4 x i32>, <4 x i32>* %5, align 8 + %7 = insertelement <4 x i32> %6, i32 %1, i64 0 + %8 = select i1 %0, <4 x i32> %7, <4 x i32> %6 + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %2) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set32_loop_via_ptr(i32 %0, i32* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set32_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB54_1: +; CHECK-FIX-NEXT: vld1.32 {d0[0]}, [r1:32] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB54_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB54_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i32, i32* %1, align 4 + %6 = bitcast <16 x i8> %2 to <4 x i32> + %7 = insertelement <4 x i32> %6, i32 %5, i64 0 + %8 = bitcast <4 x i32> %7 to <16 x i8> + %9 = icmp eq i32 %0, 0 + br i1 %9, label %13, label %10 + +10: + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %14 + +12: + store <16 x i8> %18, <16 x i8>* %3, align 8 + br label %13 + +13: + ret void + +14: + %15 = phi <16 x i8> [ %11, %10 ], [ %18, %14 ] + %16 = phi i32 [ 0, %10 ], [ %19, %14 ] + %17 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %15, <16 x i8> %8) + %18 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %17) + %19 = add nuw i32 %16, 1 + %20 = icmp eq i32 %19, %0 + br i1 %20, label %12, label %14 +} + +define arm_aapcs_vfpcc void @aesd_set32_loop_via_val(i32 %0, i32 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set32_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB55_1: +; CHECK-FIX-NEXT: vmov.32 d0[0], r1 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB55_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB55_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %12, label %6 + +6: + %7 = bitcast <16 x i8> %2 to <4 x i32> + %8 = insertelement <4 x i32> %7, i32 %1, i64 0 + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %6 ], [ %17, %13 ] + %15 = phi i32 [ 0, %6 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %14, <16 x i8> %9) + %17 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +} + +define arm_aapcs_vfpcc void @aesd_set64_via_ptr(i64* %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_set64_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vldr d0, [r0] +; CHECK-FIX-NOSCHED-NEXT: 
aesd.8 q8, q0 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_set64_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vldr d0, [r0] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: aesd.8 q8, q0 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i64, i64* %0, align 8 + %6 = bitcast <16 x i8> %1 to <2 x i64> + %7 = insertelement <2 x i64> %6, i64 %5, i64 0 + %8 = bitcast <2 x i64> %7 to <16 x i8> + %9 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set64_via_val(i64 %0, <16 x i8> %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set64_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vmov.32 d16[0], r0 +; CHECK-FIX-NEXT: vmov.32 d16[1], r1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <2 x i64>* + %5 = load <2 x i64>, <2 x i64>* %4, align 8 + %6 = insertelement <2 x i64> %5, i64 %0, i64 0 + %7 = bitcast <2 x i64> %6 to <16 x i8> + %8 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %1) + %9 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set64_cond_via_ptr(i1 zeroext %0, i64* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set64_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vldrne d0, [r1] +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i64, i64* %1, align 8 + %8 = bitcast <16 x i8> %2 to <2 x i64> + %9 = insertelement <2 x i64> %8, i64 %7, i64 0 + br label %12 + +10: + %11 = bitcast <16 x i8> %2 to <2 x i64> + br label %12 + +12: + %13 = phi <2 x i64> [ %9, %6 ], [ %11, %10 ] + %14 = bitcast <2 x i64> %13 to <16 x i8> + %15 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %14) + %16 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %15) + store <16 x i8> %16, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set64_cond_via_val(i1 zeroext %0, i64 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set64_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: ldr r1, [sp] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: beq .LBB59_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.32 d16[0], r2 +; CHECK-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-FIX-NEXT: .LBB59_2: @ %select.end +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <2 x i64>* + %6 = load <2 x i64>, <2 x i64>* %5, align 8 + %7 = insertelement <2 x i64> %6, i64 %1, i64 0 + %8 = select i1 %0, <2 x i64> %7, <2 x i64> %6 + %9 = bitcast <2 x i64> %8 to <16 x i8> + %10 = call <16 x i8> 
@llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %2) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %3, align 8 + ret void +} + +define arm_aapcs_vfpcc void @aesd_set64_loop_via_ptr(i32 %0, i64* %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set64_loop_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB60_1: +; CHECK-FIX-NEXT: vldr d0, [r1] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: .LBB60_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB60_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr + %5 = load i64, i64* %1, align 8 + %6 = bitcast <16 x i8> %2 to <2 x i64> + %7 = insertelement <2 x i64> %6, i64 %5, i64 0 + %8 = bitcast <2 x i64> %7 to <16 x i8> + %9 = icmp eq i32 %0, 0 + br i1 %9, label %13, label %10 + +10: + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %14 + +12: + store <16 x i8> %18, <16 x i8>* %3, align 8 + br label %13 + +13: + ret void + +14: + %15 = phi <16 x i8> [ %11, %10 ], [ %18, %14 ] + %16 = phi i32 [ 0, %10 ], [ %19, %14 ] + %17 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %15, <16 x i8> %8) + %18 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %17) + %19 = add nuw i32 %16, 1 + %20 = icmp eq i32 %19, %0 + br i1 %20, label %12, label %14 +} + +define arm_aapcs_vfpcc void @aesd_set64_loop_via_val(i32 %0, i64 %1, <16 x i8> %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set64_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: bxeq lr +; CHECK-FIX-NEXT: .LBB61_1: +; CHECK-FIX-NEXT: vmov.32 d0[0], r2 +; CHECK-FIX-NEXT: ldr r1, [sp] +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vmov.32 d0[1], r3 +; CHECK-FIX-NEXT: .LBB61_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: aesd.8 q8, q0 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB61_2 +; CHECK-FIX-NEXT: @ %bb.3: +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %12, label %6 + +6: + %7 = bitcast <16 x i8> %2 to <2 x i64> + %8 = insertelement <2 x i64> %7, i64 %1, i64 0 + %9 = bitcast <2 x i64> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %6 ], [ %17, %13 ] + %15 = phi i32 [ 0, %6 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %14, <16 x i8> %9) + %17 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +}
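
For anyone rerunning this test or regenerating its assertions while reviewing the patch, a typical workflow is sketched below. The `build/` path is an assumption (substitute your own build directory); `llvm-lit` and `utils/update_llc_test_checks.py` are the standard in-tree tools, the latter being the script named in the NOTE line at the top of the file.

  # Run just this test through lit, assuming an in-tree build under build/.
  build/bin/llvm-lit -v llvm/test/CodeGen/ARM/aes-erratum-fix.ll

  # Regenerate the autogenerated CHECK lines after changing the pass. The
  # script reads the RUN lines and rewrites the assertions using the llc it
  # finds on PATH (or one given explicitly via --llc-binary).
  PATH=$PWD/build/bin:$PATH llvm/utils/update_llc_test_checks.py \
      llvm/test/CodeGen/ARM/aes-erratum-fix.ll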