Index: clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c =================================================================== --- clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c +++ clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c @@ -78,20 +78,20 @@ // CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128> // CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128> // CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128> -// CHECK-LE-NEXT: [[TMP3:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]] -// CHECK-LE-NEXT: [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[TMP3]] -// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8> -// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]] +// CHECK-LE-NEXT: [[VADDUQM_I_NEG:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]] +// CHECK-LE-NEXT: [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[VADDUQM_I_NEG]] +// CHECK-LE-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8> +// CHECK-LE-NEXT: ret <16 x i8> [[TMP3]] // // CHECK-AIX-LABEL: @test_sub( // CHECK-AIX-NEXT: entry: // CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128> // CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128> // CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128> -// CHECK-AIX-NEXT: [[TMP3:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]] -// CHECK-AIX-NEXT: [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[TMP3]] -// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8> -// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]] +// CHECK-AIX-NEXT: [[VADDUQM_I_NEG:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]] +// CHECK-AIX-NEXT: [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[VADDUQM_I_NEG]] +// CHECK-AIX-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8> +// CHECK-AIX-NEXT: ret <16 x i8> [[TMP3]] // vector unsigned char test_sub(vector unsigned char a, vector unsigned char b, vector unsigned char c) { Index: clang/test/CodeGen/aarch64-neon-2velem.c =================================================================== --- clang/test/CodeGen/aarch64-neon-2velem.c +++ clang/test/CodeGen/aarch64-neon-2velem.c @@ -653,7 +653,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -668,7 +668,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -683,7 +683,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -698,7 +698,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -714,7 +714,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -730,7 +730,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -746,7 +746,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -762,7 +762,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> 
[[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -777,7 +777,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -792,7 +792,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -807,7 +807,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -822,7 +822,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -838,7 +838,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -854,7 +854,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: 
[[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -870,7 +870,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -886,7 +886,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -901,7 +901,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -916,7 +916,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -931,7 +931,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -946,7 +946,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to 
<8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -962,7 +962,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -978,7 +978,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -994,7 +994,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -1010,7 +1010,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -1025,7 +1025,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1040,7 +1040,7 @@ // CHECK-NEXT: [[LANE:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1055,7 +1055,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1070,7 +1070,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1086,7 +1086,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1102,7 +1102,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1118,7 +1118,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x 
i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1134,7 +1134,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1149,7 +1149,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { @@ -1163,7 +1163,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { @@ -1177,7 +1177,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -1191,7 +1191,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -1206,7 +1206,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> 
[[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { @@ -1221,7 +1221,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { @@ -1236,7 +1236,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { @@ -1251,7 +1251,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { @@ -1265,7 +1265,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { @@ -1279,7 +1279,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { @@ -1293,7 +1293,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -1307,7 +1307,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -1322,7 +1322,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { @@ -1337,7 +1337,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { @@ -1352,7 +1352,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { @@ -1367,7 +1367,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // 
uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { @@ -1382,8 +1382,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -1398,8 +1398,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -1415,8 +1415,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { @@ -1432,8 +1432,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> 
[[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -1448,8 +1448,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -1464,8 +1464,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -1481,8 +1481,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { @@ -1498,8 +1498,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // 
CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -1513,7 +1513,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1528,7 +1528,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1543,7 +1543,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1558,7 +1558,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1574,7 +1574,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1590,7 +1590,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1606,7 +1606,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1622,7 +1622,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1844,7 +1844,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { @@ -1858,7 +1858,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { @@ -1872,7 +1872,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> 
[[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { @@ -1886,7 +1886,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { @@ -1900,7 +1900,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { @@ -1914,7 +1914,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { @@ -2493,7 +2493,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2508,7 +2508,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2523,7 +2523,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], 
<4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2538,7 +2538,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2554,7 +2554,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2570,7 +2570,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2586,7 +2586,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2602,7 +2602,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x 
i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2617,7 +2617,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2632,7 +2632,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2647,7 +2647,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2662,7 +2662,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2678,7 +2678,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2694,7 +2694,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x 
i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2710,7 +2710,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2726,7 +2726,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2741,7 +2741,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2756,7 +2756,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2771,7 +2771,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 
x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2786,7 +2786,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2802,7 +2802,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2818,7 +2818,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2834,7 +2834,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2850,7 +2850,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2865,7 +2865,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to 
<8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2880,7 +2880,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2895,7 +2895,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2910,7 +2910,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2926,7 +2926,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2942,7 +2942,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ 
-2958,7 +2958,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2974,7 +2974,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2989,7 +2989,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -3003,7 +3003,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -3017,7 +3017,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { @@ -3031,7 +3031,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> 
[[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { @@ -3046,7 +3046,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -3061,7 +3061,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -3076,7 +3076,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { @@ -3091,7 +3091,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { @@ -3105,7 +3105,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -3119,7 +3119,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector 
<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -3133,7 +3133,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { @@ -3147,7 +3147,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { @@ -3162,7 +3162,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { @@ -3177,7 +3177,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { @@ -3192,7 +3192,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: 
[[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { @@ -3207,7 +3207,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { @@ -3222,8 +3222,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -3238,8 +3238,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -3255,8 +3255,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t 
v) { @@ -3272,8 +3272,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -3288,8 +3288,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -3304,8 +3304,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -3321,8 +3321,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t 
test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { @@ -3338,8 +3338,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -3353,7 +3353,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3368,7 +3368,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3383,7 +3383,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3398,7 +3398,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = 
bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3414,7 +3414,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3430,7 +3430,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3446,7 +3446,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3462,7 +3462,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3656,7 +3656,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { @@ -3670,7 +3670,7 @@ // CHECK-NEXT: [[LANE:%.*]] = 
shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { @@ -3684,7 +3684,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { @@ -3698,7 +3698,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { @@ -3712,7 +3712,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { @@ -3726,7 +3726,7 @@ // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { @@ -3736,14 +3736,14 @@ // CHECK-LABEL: @test_vmull_high_n_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0 -// CHECK-NEXT: 
[[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I_I]] // int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) { return vmull_high_n_s16(a, b); @@ -3752,12 +3752,12 @@ // CHECK-LABEL: @test_vmull_high_n_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I_I]] // int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) { return vmull_high_n_s32(a, b); @@ -3766,14 +3766,14 @@ // CHECK-LABEL: @test_vmull_high_n_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3 // 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I_I]] // uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) { return vmull_high_n_u16(a, b); @@ -3782,12 +3782,12 @@ // CHECK-LABEL: @test_vmull_high_n_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I_I]] // uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) { return vmull_high_n_u32(a, b); @@ -3796,15 +3796,15 @@ // CHECK-LABEL: @test_vqdmull_high_n_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> 
[[VECINIT3_I_I]]) #4 +// CHECK-NEXT: [[VQDMULL_V3_I_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I_I]] to <16 x i8> +// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I_I]] // int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) { return vqdmull_high_n_s16(a, b); @@ -3813,13 +3813,13 @@ // CHECK-LABEL: @test_vqdmull_high_n_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> -// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: [[VQDMULL_V3_I_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I_I]] to <16 x i8> +// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I_I]] // int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { return vqdmull_high_n_s32(a, b); @@ -3828,15 +3828,15 @@ // CHECK-LABEL: @test_vmlal_high_n_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] -// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] // int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vmlal_high_n_s16(a, b, c); @@ -3845,13 +3845,13 @@ // 
CHECK-LABEL: @test_vmlal_high_n_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] -// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] // int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vmlal_high_n_s32(a, b, c); @@ -3860,15 +3860,15 @@ // CHECK-LABEL: @test_vmlal_high_n_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] -// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] // uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { return vmlal_high_n_u16(a, b, c); @@ -3877,13 +3877,13 @@ // CHECK-LABEL: @test_vmlal_high_n_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// 
CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] -// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] // uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { return vmlal_high_n_u32(a, b, c); @@ -3892,16 +3892,16 @@ // CHECK-LABEL: @test_vqdmlal_high_n_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4 +// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I_I]] // int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vqdmlal_high_n_s16(a, b, c); @@ -3910,14 +3910,14 @@ // CHECK-LABEL: @test_vqdmlal_high_n_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: 
[[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4 +// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I_I]] // int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vqdmlal_high_n_s32(a, b, c); @@ -3926,15 +3926,15 @@ // CHECK-LABEL: @test_vmlsl_high_n_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] -// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]] // int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vmlsl_high_n_s16(a, b, c); @@ -3943,13 +3943,13 @@ // CHECK-LABEL: @test_vmlsl_high_n_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] -// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]] // int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vmlsl_high_n_s32(a, b, c); @@ -3958,15 +3958,15 @@ // CHECK-LABEL: @test_vmlsl_high_n_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] -// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]] // uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { return vmlsl_high_n_u16(a, b, c); @@ -3975,13 +3975,13 @@ // CHECK-LABEL: @test_vmlsl_high_n_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] -// 
CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]] // uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { return vmlsl_high_n_u32(a, b, c); @@ -3990,16 +3990,16 @@ // CHECK-LABEL: @test_vqdmlsl_high_n_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4 +// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I_I]] // int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return vqdmlsl_high_n_s16(a, b, c); @@ -4008,14 +4008,14 @@ // CHECK-LABEL: @test_vqdmlsl_high_n_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 
x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4 +// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I_I]] // int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return vqdmlsl_high_n_s32(a, b, c); @@ -4063,7 +4063,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4 // CHECK-NEXT: ret <2 x float> [[TMP3]] // float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { @@ -4076,7 +4076,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4 // CHECK-NEXT: ret <1 x double> [[TMP3]] // float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) { @@ -4092,7 +4092,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4 // CHECK-NEXT: ret <4 x float> [[TMP3]] // float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { @@ -4107,7 +4107,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4 // CHECK-NEXT: ret <2 x float> [[TMP3]] // float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { @@ -4121,7 +4121,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) 
#[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4 // CHECK-NEXT: ret <1 x double> [[TMP3]] // float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) { @@ -4138,7 +4138,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4 // CHECK-NEXT: ret <4 x float> [[TMP3]] // float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { @@ -4261,7 +4261,7 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] // int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { @@ -4274,7 +4274,7 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] // int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { @@ -4289,7 +4289,7 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] // uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { @@ -4302,7 +4302,7 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] // uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { @@ -4317,7 +4317,7 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]] // @@ -4331,7 +4331,7 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]] // @@ -4347,9 +4347,9 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]] +// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4 +// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I_I]] to <8 x i8> +// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I_I]] // int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { return vqdmulh_n_s16(a, b); @@ -4367,9 +4367,9 @@ // CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> -// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]] +// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4 +// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I_I]] to <16 x i8> +// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I_I]] // int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { return vqdmulhq_n_s16(a, b); @@ -4381,9 +4381,9 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <2 x i32> 
[[VQDMULH_V2_I]] +// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4 +// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I_I]] to <8 x i8> +// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I_I]] // int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { return vqdmulh_n_s32(a, b); @@ -4397,9 +4397,9 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]] +// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4 +// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I_I]] to <16 x i8> +// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I_I]] // int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { return vqdmulhq_n_s32(a, b); @@ -4413,9 +4413,9 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]] +// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4 +// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I_I]] to <8 x i8> +// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I_I]] // int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); @@ -4433,9 +4433,9 @@ // CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> -// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]] +// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4 +// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I_I]] to <16 x i8> +// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I_I]] // int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); @@ -4447,9 +4447,9 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = 
bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]] +// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4 +// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I_I]] to <8 x i8> +// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I_I]] // int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); @@ -4463,9 +4463,9 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]] +// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4 +// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I_I]] to <16 x i8> +// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I_I]] // int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); @@ -4595,7 +4595,7 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -4609,7 +4609,7 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4 // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -4625,7 +4625,7 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -4639,7 +4639,7 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x 
i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4 // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -4656,8 +4656,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]] // int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { @@ -4671,8 +4671,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]] // int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { @@ -4803,7 +4803,7 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -4817,7 +4817,7 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4 // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -4833,7 +4833,7 @@ // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x 
i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -4847,7 +4847,7 @@ // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4 // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -4864,8 +4864,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]] // int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { @@ -4879,8 +4879,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]] // int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { @@ -4999,8 +4999,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: 
[[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5015,8 +5015,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5032,8 +5032,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5049,8 +5049,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { @@ -5169,8 +5169,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> 
[[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5185,8 +5185,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5202,8 +5202,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5219,8 +5219,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { @@ -5443,8 +5443,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5459,8 +5459,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5476,8 +5476,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5493,8 +5493,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { @@ -5613,8 +5613,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: 
[[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5629,8 +5629,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5646,8 +5646,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5663,8 +5663,8 @@ // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]] +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { Index: clang/test/CodeGen/aarch64-neon-fp16fml.c =================================================================== --- clang/test/CodeGen/aarch64-neon-fp16fml.c +++ clang/test/CodeGen/aarch64-neon-fp16fml.c @@ -15,7 +15,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> 
[[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -27,7 +27,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -39,7 +39,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -51,7 +51,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] // float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -63,7 +63,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -75,7 +75,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] // float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -87,7 +87,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -99,7 +99,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] // float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -110,50 +110,50 @@ // CHECK-LABEL: @test_vfmlal_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> 
[[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -162,50 +162,50 @@ // CHECK-LABEL: @test_vfmlal_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> 
[[TMP9]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -214,90 +214,90 @@ // CHECK-LABEL: @test_vfmlalq_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 
x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71634:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71644:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71665:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: 
[[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load 
half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8 -// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>* // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half* // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -306,90 +306,90 @@ // CHECK-LABEL: @test_vfmlalq_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: 
[[__REINT_71634:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71644:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71665:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = 
extractelement <4 x i16> [[TMP13]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8 -// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* 
[[__REINT_71664]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>* // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half* // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -398,50 +398,50 @@ // CHECK-LABEL: @test_vfmlal_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -450,50 +450,50 @@ // CHECK-LABEL: @test_vfmlal_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x 
half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = 
bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -502,90 +502,90 @@ // CHECK-LABEL: @test_vfmlalq_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca 
<8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // 
CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16 -// 
CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>* // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_85465]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half* // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -594,90 +594,90 @@ // CHECK-LABEL: @test_vfmlalq_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16 +// 
CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* 
[[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16 -// CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>* // CHECK-NEXT: [[TMP29:%.*]] = 
load <8 x i16>, <8 x i16>* [[TMP28]], align 16 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_85465]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half* // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -686,50 +686,50 @@ // CHECK-LABEL: @test_vfmlsl_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -738,50 +738,50 @@ // CHECK-LABEL: @test_vfmlsl_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = 
alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 
-// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] // float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -790,90 +790,90 @@ // CHECK-LABEL: @test_vfmlslq_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71634:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2 +// CHECK-NEXT: 
[[__REINT_71644:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71665:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: 
[[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8 -// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>* // CHECK-NEXT: 
[[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half* // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] // float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -882,90 +882,90 @@ // CHECK-LABEL: @test_vfmlslq_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>* +// CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71634:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71644:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8 +// CHECK-NEXT: 
[[__REINT1_71665:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x 
half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8 -// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>* +// CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>* // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half* +// CHECK-NEXT: store i16 [[VGET_LANE68]], i16* 
[[__REINT1_71665]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half* // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] // float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -974,50 +974,50 @@ // CHECK-LABEL: @test_vfmlsl_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: 
[[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -1026,50 +1026,50 @@ // CHECK-LABEL: @test_vfmlsl_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// 
CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: 
[[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] // float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -1078,90 +1078,90 @@ // CHECK-LABEL: @test_vfmlslq_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, 
align 2 +// CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // 
CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16 -// CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>* // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* 
[[__REINT1_85465]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half* // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] // float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -1170,90 +1170,90 @@ // CHECK-LABEL: @test_vfmlslq_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>* +// CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* 
[[__REINT_719]] to <8 x i16>* // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half* // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>* // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2 -// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half* // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>* // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half* // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>* // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2 -// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half* // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16 -// CHECK-NEXT: [[TMP16:%.*]] = 
bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>* // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2 -// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half* // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16 -// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16 +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>* // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2 -// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half* // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16 -// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16 +// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>* // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2 -// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half* // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16 -// CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>* +// CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>* // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_85465]], align 2 -// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half* +// CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2 +// CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half* // CHECK-NEXT: [[TMP31:%.*]] = 
load half, half* [[TMP30]], align 2 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]] +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] // float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { Index: clang/test/CodeGen/arm_acle.c =================================================================== --- clang/test/CodeGen/arm_acle.c +++ clang/test/CodeGen/arm_acle.c @@ -56,12 +56,12 @@ /* 8.4 Hints */ // AArch32-LABEL: @test_yield( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.arm.hint(i32 1) #[[ATTR1:[0-9]+]] +// AArch32-NEXT: call void @llvm.arm.hint(i32 1) [[ATTR1:#.*]] // AArch32-NEXT: ret void // // AArch64-LABEL: @test_yield( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.aarch64.hint(i32 1) #[[ATTR3:[0-9]+]] +// AArch64-NEXT: call void @llvm.aarch64.hint(i32 1) [[ATTR3:#.*]] // AArch64-NEXT: ret void // void test_yield(void) { @@ -70,12 +70,12 @@ // AArch32-LABEL: @test_wfe( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.arm.hint(i32 2) #[[ATTR1]] +// AArch32-NEXT: call void @llvm.arm.hint(i32 2) [[ATTR1]] // AArch32-NEXT: ret void // // AArch64-LABEL: @test_wfe( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.aarch64.hint(i32 2) #[[ATTR3]] +// AArch64-NEXT: call void @llvm.aarch64.hint(i32 2) [[ATTR3]] // AArch64-NEXT: ret void // void test_wfe(void) { @@ -84,12 +84,12 @@ // AArch32-LABEL: @test_wfi( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.arm.hint(i32 3) #[[ATTR1]] +// AArch32-NEXT: call void @llvm.arm.hint(i32 3) [[ATTR1]] // AArch32-NEXT: ret void // // AArch64-LABEL: @test_wfi( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.aarch64.hint(i32 3) #[[ATTR3]] +// AArch64-NEXT: call void @llvm.aarch64.hint(i32 3) [[ATTR3]] // AArch64-NEXT: ret void // void test_wfi(void) { @@ -98,12 +98,12 @@ // AArch32-LABEL: @test_sev( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.arm.hint(i32 4) #[[ATTR1]] +// AArch32-NEXT: call void @llvm.arm.hint(i32 4) [[ATTR1]] // AArch32-NEXT: ret void // // AArch64-LABEL: @test_sev( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.aarch64.hint(i32 4) #[[ATTR3]] +// AArch64-NEXT: call void @llvm.aarch64.hint(i32 4) [[ATTR3]] // AArch64-NEXT: ret void // void test_sev(void) { @@ -112,12 +112,12 @@ // AArch32-LABEL: @test_sevl( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.arm.hint(i32 5) #[[ATTR1]] +// AArch32-NEXT: call void @llvm.arm.hint(i32 5) [[ATTR1]] // AArch32-NEXT: ret void // // AArch64-LABEL: @test_sevl( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.aarch64.hint(i32 5) #[[ATTR3]] +// AArch64-NEXT: call void @llvm.aarch64.hint(i32 5) [[ATTR3]] // AArch64-NEXT: ret void // void test_sevl(void) { @@ -141,10 +141,10 @@ // AArch32-NEXT: [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to i32* // AArch32-NEXT: br label [[DO_BODY_I:%.*]] // AArch32: do.body.i: -// AArch32-NEXT: [[LDREX_I:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* 
elementtype(i32) [[TMP0]]) #[[ATTR1]] -// AArch32-NEXT: [[STREX_I:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[X:%.*]], i32* elementtype(i32) [[TMP0]]) #[[ATTR1]] +// AArch32-NEXT: [[LDREX_I:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* elementtype(i32) [[TMP0]]) [[ATTR1]] +// AArch32-NEXT: [[STREX_I:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[X:%.*]], i32* elementtype(i32) [[TMP0]]) [[ATTR1]] // AArch32-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[STREX_I]], 0 -// AArch32-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +// AArch32-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], [[LOOP3:!llvm.loop !.*]] // AArch32: __swp.exit: // AArch32-NEXT: ret void // @@ -153,12 +153,12 @@ // AArch64-NEXT: [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to i32* // AArch64-NEXT: br label [[DO_BODY_I:%.*]] // AArch64: do.body.i: -// AArch64-NEXT: [[LDXR_I:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* elementtype(i32) [[TMP0]]) #[[ATTR3]] +// AArch64-NEXT: [[LDXR_I:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* elementtype(i32) [[TMP0]]) [[ATTR3]] // AArch64-NEXT: [[TMP1:%.*]] = trunc i64 [[LDXR_I]] to i32 // AArch64-NEXT: [[TMP2:%.*]] = zext i32 [[X:%.*]] to i64 -// AArch64-NEXT: [[STXR_I:%.*]] = call i32 @llvm.aarch64.stxr.p0i32(i64 [[TMP2]], i32* elementtype(i32) [[TMP0]]) #[[ATTR3]] +// AArch64-NEXT: [[STXR_I:%.*]] = call i32 @llvm.aarch64.stxr.p0i32(i64 [[TMP2]], i32* elementtype(i32) [[TMP0]]) [[ATTR3]] // AArch64-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[STXR_I]], 0 -// AArch64-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +// AArch64-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], [[LOOP6:!llvm.loop !.*]] // AArch64: __swp.exit: // AArch64-NEXT: ret void // @@ -218,12 +218,12 @@ /* 8.7 NOP */ // AArch32-LABEL: @test_nop( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.arm.hint(i32 0) #[[ATTR1]] +// AArch32-NEXT: call void @llvm.arm.hint(i32 0) [[ATTR1]] // AArch32-NEXT: ret void // // AArch64-LABEL: @test_nop( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.aarch64.hint(i32 0) #[[ATTR3]] +// AArch64-NEXT: call void @llvm.aarch64.hint(i32 0) [[ATTR3]] // AArch64-NEXT: ret void // void test_nop(void) { @@ -319,12 +319,12 @@ // AArch32-LABEL: @test_clz( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // // AArch64-LABEL: @test_clz( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP0]] // uint32_t test_clz(uint32_t t) { @@ -333,12 +333,12 @@ // AArch32-LABEL: @test_clzl( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // // AArch64-LABEL: @test_clzl( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR3]] // AArch64-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32 // AArch64-NEXT: [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64 // 
AArch64-NEXT: ret i64 [[CONV_I]] @@ -349,14 +349,14 @@ // AArch32-LABEL: @test_clzll( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR1]] // AArch32-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32 // AArch32-NEXT: [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64 // AArch32-NEXT: ret i64 [[CONV_I]] // // AArch64-LABEL: @test_clzll( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR3]] // AArch64-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32 // AArch64-NEXT: [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64 // AArch64-NEXT: ret i64 [[CONV_I]] @@ -367,12 +367,12 @@ // AArch32-LABEL: @test_cls( // AArch32-NEXT: entry: -// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[CLS_I]] // // AArch64-LABEL: @test_cls( // AArch64-NEXT: entry: -// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls(i32 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls(i32 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[CLS_I]] // unsigned test_cls(uint32_t t) { @@ -381,12 +381,12 @@ // AArch32-LABEL: @test_clsl( // AArch32-NEXT: entry: -// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[CLS_I]] // // AArch64-LABEL: @test_clsl( // AArch64-NEXT: entry: -// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[CLS_I]] // unsigned test_clsl(unsigned long t) { @@ -395,12 +395,12 @@ // AArch32-LABEL: @test_clsll( // AArch32-NEXT: entry: -// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls64(i64 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls64(i64 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[CLS_I]] // // AArch64-LABEL: @test_clsll( // AArch64-NEXT: entry: -// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[CLS_I]] // unsigned test_clsll(uint64_t t) { @@ -409,12 +409,12 @@ // AArch32-LABEL: @test_rev( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // // AArch64-LABEL: @test_rev( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP0]] // uint32_t test_rev(uint32_t t) { @@ -423,12 +423,12 @@ // AArch32-LABEL: @test_revl( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // // AArch64-LABEL: @test_revl( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i64 
@llvm.bswap.i64(i64 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i64 [[TMP0]] // long test_revl(long t) { @@ -437,12 +437,12 @@ // AArch32-LABEL: @test_revll( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i64 [[TMP0]] // // AArch64-LABEL: @test_revll( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i64 [[TMP0]] // uint64_t test_revll(uint64_t t) { @@ -451,7 +451,7 @@ // AArch32-LABEL: @test_rev16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: [[REM_I_I:%.*]] = urem i32 16, 32 // AArch32-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[REM_I_I]], 0 // AArch32-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]] @@ -469,7 +469,7 @@ // // AArch64-LABEL: @test_rev16( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: [[REM_I_I:%.*]] = urem i32 16, 32 // AArch64-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[REM_I_I]], 0 // AArch64-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]] @@ -491,7 +491,7 @@ // AArch32-LABEL: @test_rev16l( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32 // AArch32-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0 // AArch32-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]] @@ -511,7 +511,7 @@ // AArch64-NEXT: entry: // AArch64-NEXT: [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32 // AArch64-NEXT: [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32 -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR3]] // AArch64-NEXT: [[REM_I_I10_I:%.*]] = urem i32 16, 32 // AArch64-NEXT: [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0 // AArch64-NEXT: br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]] @@ -528,7 +528,7 @@ // AArch64-NEXT: [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64 // AArch64-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32 // AArch64-NEXT: [[CONV2_I:%.*]] = trunc i64 [[T]] to i32 -// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR3]] // AArch64-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32 // AArch64-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0 // AArch64-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]] @@ -554,7 +554,7 @@ // AArch32-NEXT: entry: // AArch32-NEXT: [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32 // AArch32-NEXT: [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32 -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 
@llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR1]] // AArch32-NEXT: [[REM_I_I10_I:%.*]] = urem i32 16, 32 // AArch32-NEXT: [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0 // AArch32-NEXT: br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]] @@ -571,7 +571,7 @@ // AArch32-NEXT: [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64 // AArch32-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32 // AArch32-NEXT: [[CONV2_I:%.*]] = trunc i64 [[T]] to i32 -// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR1]] // AArch32-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32 // AArch32-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0 // AArch32-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]] @@ -593,7 +593,7 @@ // AArch64-NEXT: entry: // AArch64-NEXT: [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32 // AArch64-NEXT: [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32 -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR3]] // AArch64-NEXT: [[REM_I_I10_I:%.*]] = urem i32 16, 32 // AArch64-NEXT: [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0 // AArch64-NEXT: br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]] @@ -610,7 +610,7 @@ // AArch64-NEXT: [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64 // AArch64-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32 // AArch64-NEXT: [[CONV2_I:%.*]] = trunc i64 [[T]] to i32 -// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR3]] // AArch64-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32 // AArch64-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0 // AArch64-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]] @@ -634,12 +634,12 @@ // AArch32-LABEL: @test_revsh( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i16 [[TMP0]] // // AArch64-LABEL: @test_revsh( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i16 [[TMP0]] // int16_t test_revsh(int16_t t) { @@ -648,12 +648,12 @@ // AArch32-LABEL: @test_rbit( // AArch32-NEXT: entry: -// AArch32-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[RBIT_I]] // // AArch64-LABEL: @test_rbit( // AArch64-NEXT: entry: -// AArch64-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[RBIT_I]] // uint32_t test_rbit(uint32_t t) { @@ -662,12 +662,12 @@ // AArch32-LABEL: @test_rbitl( // AArch32-NEXT: entry: -// AArch32-NEXT: [[RBIT_I_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[RBIT_I_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[RBIT_I_I]] // // AArch64-LABEL: @test_rbitl( // AArch64-NEXT: entry: -// AArch64-NEXT: 
[[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i64 [[RBIT_I]] // long test_rbitl(long t) { @@ -677,19 +677,19 @@ // AArch32-LABEL: @test_rbitll( // AArch32-NEXT: entry: // AArch32-NEXT: [[CONV_I:%.*]] = trunc i64 [[T:%.*]] to i32 -// AArch32-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV_I]]) #[[ATTR1]] +// AArch32-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV_I]]) [[ATTR1]] // AArch32-NEXT: [[CONV1_I:%.*]] = zext i32 [[RBIT_I]] to i64 // AArch32-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32 // AArch32-NEXT: [[SHR_I:%.*]] = lshr i64 [[T]], 32 // AArch32-NEXT: [[CONV2_I:%.*]] = trunc i64 [[SHR_I]] to i32 -// AArch32-NEXT: [[RBIT3_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV2_I]]) #[[ATTR1]] +// AArch32-NEXT: [[RBIT3_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV2_I]]) [[ATTR1]] // AArch32-NEXT: [[CONV4_I:%.*]] = zext i32 [[RBIT3_I]] to i64 // AArch32-NEXT: [[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV4_I]] // AArch32-NEXT: ret i64 [[OR_I]] // // AArch64-LABEL: @test_rbitll( // AArch64-NEXT: entry: -// AArch64-NEXT: [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i64 [[RBIT_I]] // uint64_t test_rbitll(uint64_t t) { @@ -722,7 +722,7 @@ #ifdef __ARM_FEATURE_DSP // AArch32-LABEL: @test_qadd( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_qadd(int32_t a, int32_t b) { @@ -731,7 +731,7 @@ // AArch32-LABEL: @test_qsub( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_qsub(int32_t a, int32_t b) { @@ -741,8 +741,8 @@ extern int32_t f(); // AArch32-LABEL: @test_qdbl( // AArch32-NEXT: entry: -// AArch32-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @f to i32 ()*)() #[[ATTR7:[0-9]+]] -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[CALL]], i32 [[CALL]]) #[[ATTR1]] +// AArch32-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @f to i32 ()*)() [[ATTR7:#.*]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[CALL]], i32 [[CALL]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_qdbl() { @@ -756,7 +756,7 @@ #if __ARM_FEATURE_DSP // AArch32-LABEL: @test_smulbb( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbb(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smulbb(int32_t a, int32_t b) { @@ -765,7 +765,7 @@ // AArch32-LABEL: @test_smulbt( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbt(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smulbt(int32_t a, int32_t b) { @@ -774,7 +774,7 @@ // AArch32-LABEL: @test_smultb( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 
@llvm.arm.smultb(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smultb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smultb(int32_t a, int32_t b) { @@ -783,7 +783,7 @@ // AArch32-LABEL: @test_smultt( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smultt(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smultt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smultt(int32_t a, int32_t b) { @@ -792,7 +792,7 @@ // AArch32-LABEL: @test_smulwb( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwb(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smulwb(int32_t a, int32_t b) { @@ -801,7 +801,7 @@ // AArch32-LABEL: @test_smulwt( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwt(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smulwt(int32_t a, int32_t b) { @@ -813,7 +813,7 @@ #if __ARM_FEATURE_DSP // AArch32-LABEL: @test_smlabb( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlabb(int32_t a, int32_t b, int32_t c) { @@ -822,7 +822,7 @@ // AArch32-LABEL: @test_smlabt( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlabt(int32_t a, int32_t b, int32_t c) { @@ -831,7 +831,7 @@ // AArch32-LABEL: @test_smlatb( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlatb(int32_t a, int32_t b, int32_t c) { @@ -840,7 +840,7 @@ // AArch32-LABEL: @test_smlatt( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlatt(int32_t a, int32_t b, int32_t c) { @@ -849,7 +849,7 @@ // AArch32-LABEL: @test_smlawb( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlawb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlawb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlawb(int32_t a, int32_t b, int32_t c) { @@ -858,7 +858,7 @@ // AArch32-LABEL: @test_smlawt( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlawt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 
@llvm.arm.smlawt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlawt(int32_t a, int32_t b, int32_t c) { @@ -891,7 +891,7 @@ #if __ARM_FEATURE_SIMD32 // AArch32-LABEL: @test_sxtab16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_sxtab16(int16x2_t a, int8x4_t b) { @@ -900,7 +900,7 @@ // AArch32-LABEL: @test_sxtb16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtb16(i32 [[A:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtb16(i32 [[A:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_sxtb16(int8x4_t a) { @@ -909,7 +909,7 @@ // AArch32-LABEL: @test_uxtab16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_uxtab16(int16x2_t a, int8x4_t b) { @@ -918,7 +918,7 @@ // AArch32-LABEL: @test_uxtb16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtb16(i32 [[A:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtb16(i32 [[A:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_uxtb16(int8x4_t a) { @@ -930,7 +930,7 @@ #if __ARM_FEATURE_SIMD32 // AArch32-LABEL: @test_sel( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sel(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sel(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint8x4_t test_sel(uint8x4_t a, uint8x4_t b) { @@ -942,7 +942,7 @@ #if __ARM_FEATURE_SIMD32 // AArch32-LABEL: @test_qadd8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_qadd8(int8x4_t a, int8x4_t b) { @@ -951,7 +951,7 @@ // AArch32-LABEL: @test_qsub8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int8x4_t test_qsub8(int8x4_t a, int8x4_t b) { @@ -960,7 +960,7 @@ // AArch32-LABEL: @test_sadd8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int8x4_t test_sadd8(int8x4_t a, int8x4_t b) { @@ -969,7 +969,7 @@ // AArch32-LABEL: @test_shadd8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int8x4_t test_shadd8(int8x4_t a, int8x4_t b) { @@ -978,7 +978,7 @@ // AArch32-LABEL: @test_shsub8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsub8(i32 [[A:%.*]], i32 [[B:%.*]]) 
#[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int8x4_t test_shsub8(int8x4_t a, int8x4_t b) { @@ -987,7 +987,7 @@ // AArch32-LABEL: @test_ssub8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int8x4_t test_ssub8(int8x4_t a, int8x4_t b) { @@ -996,7 +996,7 @@ // AArch32-LABEL: @test_uadd8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint8x4_t test_uadd8(uint8x4_t a, uint8x4_t b) { @@ -1005,7 +1005,7 @@ // AArch32-LABEL: @test_uhadd8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint8x4_t test_uhadd8(uint8x4_t a, uint8x4_t b) { @@ -1014,7 +1014,7 @@ // AArch32-LABEL: @test_uhsub8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint8x4_t test_uhsub8(uint8x4_t a, uint8x4_t b) { @@ -1023,7 +1023,7 @@ // AArch32-LABEL: @test_uqadd8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint8x4_t test_uqadd8(uint8x4_t a, uint8x4_t b) { @@ -1032,7 +1032,7 @@ // AArch32-LABEL: @test_uqsub8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint8x4_t test_uqsub8(uint8x4_t a, uint8x4_t b) { @@ -1041,7 +1041,7 @@ // AArch32-LABEL: @test_usub8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint8x4_t test_usub8(uint8x4_t a, uint8x4_t b) { @@ -1053,7 +1053,7 @@ #if __ARM_FEATURE_SIMD32 // AArch32-LABEL: @test_usad8( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usad8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usad8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint32_t test_usad8(uint8x4_t a, uint8x4_t b) { @@ -1065,7 +1065,7 @@ // AArch32-NEXT: [[CONV:%.*]] = zext i8 [[A:%.*]] to i32 // AArch32-NEXT: [[CONV1:%.*]] = zext i8 [[B:%.*]] to i32 // AArch32-NEXT: [[CONV2:%.*]] = zext i8 [[C:%.*]] to i32 -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usada8(i32 [[CONV]], i32 [[CONV1]], i32 [[CONV2]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usada8(i32 [[CONV]], i32 [[CONV1]], i32 [[CONV2]]) [[ATTR1]] // AArch32-NEXT: ret i32 
[[TMP0]] // uint32_t test_usada8(uint8_t a, uint8_t b, uint8_t c) { @@ -1077,7 +1077,7 @@ #if __ARM_FEATURE_SIMD32 // AArch32-LABEL: @test_qadd16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_qadd16(int16x2_t a, int16x2_t b) { @@ -1086,7 +1086,7 @@ // AArch32-LABEL: @test_qasx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_qasx(int16x2_t a, int16x2_t b) { @@ -1095,7 +1095,7 @@ // AArch32-LABEL: @test_qsax( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_qsax(int16x2_t a, int16x2_t b) { @@ -1104,7 +1104,7 @@ // AArch32-LABEL: @test_qsub16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_qsub16(int16x2_t a, int16x2_t b) { @@ -1113,7 +1113,7 @@ // AArch32-LABEL: @test_sadd16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_sadd16(int16x2_t a, int16x2_t b) { @@ -1122,7 +1122,7 @@ // AArch32-LABEL: @test_sasx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_sasx(int16x2_t a, int16x2_t b) { @@ -1131,7 +1131,7 @@ // AArch32-LABEL: @test_shadd16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_shadd16(int16x2_t a, int16x2_t b) { @@ -1140,7 +1140,7 @@ // AArch32-LABEL: @test_shasx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_shasx(int16x2_t a, int16x2_t b) { @@ -1149,7 +1149,7 @@ // AArch32-LABEL: @test_shsax( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_shsax(int16x2_t a, int16x2_t b) { @@ -1158,7 +1158,7 @@ // AArch32-LABEL: @test_shsub16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = 
call i32 @llvm.arm.shsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_shsub16(int16x2_t a, int16x2_t b) { @@ -1167,7 +1167,7 @@ // AArch32-LABEL: @test_ssax( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_ssax(int16x2_t a, int16x2_t b) { @@ -1176,7 +1176,7 @@ // AArch32-LABEL: @test_ssub16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int16x2_t test_ssub16(int16x2_t a, int16x2_t b) { @@ -1185,7 +1185,7 @@ // AArch32-LABEL: @test_uadd16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uadd16(uint16x2_t a, uint16x2_t b) { @@ -1194,7 +1194,7 @@ // AArch32-LABEL: @test_uasx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uasx(uint16x2_t a, uint16x2_t b) { @@ -1203,7 +1203,7 @@ // AArch32-LABEL: @test_uhadd16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uhadd16(uint16x2_t a, uint16x2_t b) { @@ -1212,7 +1212,7 @@ // AArch32-LABEL: @test_uhasx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uhasx(uint16x2_t a, uint16x2_t b) { @@ -1221,7 +1221,7 @@ // AArch32-LABEL: @test_uhsax( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uhsax(uint16x2_t a, uint16x2_t b) { @@ -1230,7 +1230,7 @@ // AArch32-LABEL: @test_uhsub16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uhsub16(uint16x2_t a, uint16x2_t b) { @@ -1239,7 +1239,7 @@ // AArch32-LABEL: @test_uqadd16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uqadd16(uint16x2_t a, uint16x2_t b) { @@ -1248,7 +1248,7 @@ // AArch32-LABEL: @test_uqasx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call 
i32 @llvm.arm.uqasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uqasx(uint16x2_t a, uint16x2_t b) { @@ -1257,7 +1257,7 @@ // AArch32-LABEL: @test_uqsax( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uqsax(uint16x2_t a, uint16x2_t b) { @@ -1266,7 +1266,7 @@ // AArch32-LABEL: @test_uqsub16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_uqsub16(uint16x2_t a, uint16x2_t b) { @@ -1275,7 +1275,7 @@ // AArch32-LABEL: @test_usax( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_usax(uint16x2_t a, uint16x2_t b) { @@ -1284,7 +1284,7 @@ // AArch32-LABEL: @test_usub16( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // uint16x2_t test_usub16(uint16x2_t a, uint16x2_t b) { @@ -1296,7 +1296,7 @@ #if __ARM_FEATURE_SIMD32 // AArch32-LABEL: @test_smlad( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlad(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlad(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlad(int16x2_t a, int16x2_t b, int32_t c) { @@ -1305,7 +1305,7 @@ // AArch32-LABEL: @test_smladx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smladx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smladx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smladx(int16x2_t a, int16x2_t b, int32_t c) { @@ -1314,7 +1314,7 @@ // AArch32-LABEL: @test_smlald( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlald(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlald(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i64 [[TMP0]] // int64_t test_smlald(int16x2_t a, int16x2_t b, int64_t c) { @@ -1323,7 +1323,7 @@ // AArch32-LABEL: @test_smlaldx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i64 [[TMP0]] // int64_t test_smlaldx(int16x2_t a, int16x2_t b, int64_t c) { @@ -1332,7 +1332,7 @@ // AArch32-LABEL: @test_smlsd( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlsd(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: 
[[TMP0:%.*]] = call i32 @llvm.arm.smlsd(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlsd(int16x2_t a, int16x2_t b, int32_t c) { @@ -1341,7 +1341,7 @@ // AArch32-LABEL: @test_smlsdx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlsdx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlsdx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smlsdx(int16x2_t a, int16x2_t b, int32_t c) { @@ -1350,7 +1350,7 @@ // AArch32-LABEL: @test_smlsld( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsld(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsld(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i64 [[TMP0]] // int64_t test_smlsld(int16x2_t a, int16x2_t b, int64_t c) { @@ -1359,7 +1359,7 @@ // AArch32-LABEL: @test_smlsldx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i64 [[TMP0]] // int64_t test_smlsldx(int16x2_t a, int16x2_t b, int64_t c) { @@ -1368,7 +1368,7 @@ // AArch32-LABEL: @test_smuad( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuad(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuad(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smuad(int16x2_t a, int16x2_t b) { @@ -1377,7 +1377,7 @@ // AArch32-LABEL: @test_smuadx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuadx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuadx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smuadx(int16x2_t a, int16x2_t b) { @@ -1386,7 +1386,7 @@ // AArch32-LABEL: @test_smusd( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusd(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusd(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smusd(int16x2_t a, int16x2_t b) { @@ -1395,7 +1395,7 @@ // AArch32-LABEL: @test_smusdx( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusdx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusdx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // int32_t test_smusdx(int16x2_t a, int16x2_t b) { @@ -1407,13 +1407,13 @@ // AArch32-LABEL: @test_crc32b( // AArch32-NEXT: entry: // AArch32-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32 -// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP1]] // // AArch64-LABEL: @test_crc32b( // AArch64-NEXT: entry: // AArch64-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32 -// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]] // AArch64-NEXT: ret 
i32 [[TMP1]] // uint32_t test_crc32b(uint32_t a, uint8_t b) { @@ -1423,13 +1423,13 @@ // AArch32-LABEL: @test_crc32h( // AArch32-NEXT: entry: // AArch32-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32 -// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP1]] // // AArch64-LABEL: @test_crc32h( // AArch64-NEXT: entry: // AArch64-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32 -// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP1]] // uint32_t test_crc32h(uint32_t a, uint16_t b) { @@ -1438,12 +1438,12 @@ // AArch32-LABEL: @test_crc32w( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // // AArch64-LABEL: @test_crc32w( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP0]] // uint32_t test_crc32w(uint32_t a, uint32_t b) { @@ -1455,13 +1455,13 @@ // AArch32-NEXT: [[TMP0:%.*]] = trunc i64 [[B:%.*]] to i32 // AArch32-NEXT: [[TMP1:%.*]] = lshr i64 [[B]], 32 // AArch32-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 -// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]] -// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32w(i32 [[TMP3]], i32 [[TMP2]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]] +// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32w(i32 [[TMP3]], i32 [[TMP2]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP4]] // // AArch64-LABEL: @test_crc32d( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32x(i32 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32x(i32 [[A:%.*]], i64 [[B:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP0]] // uint32_t test_crc32d(uint32_t a, uint64_t b) { @@ -1471,13 +1471,13 @@ // AArch32-LABEL: @test_crc32cb( // AArch32-NEXT: entry: // AArch32-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32 -// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP1]] // // AArch64-LABEL: @test_crc32cb( // AArch64-NEXT: entry: // AArch64-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32 -// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP1]] // uint32_t test_crc32cb(uint32_t a, uint8_t b) { @@ -1487,13 +1487,13 @@ // AArch32-LABEL: @test_crc32ch( // AArch32-NEXT: entry: // AArch32-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32 -// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32ch(i32 
[[A:%.*]], i32 [[TMP0]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP1]] // // AArch64-LABEL: @test_crc32ch( // AArch64-NEXT: entry: // AArch64-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32 -// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP1]] // uint32_t test_crc32ch(uint32_t a, uint16_t b) { @@ -1502,12 +1502,12 @@ // AArch32-LABEL: @test_crc32cw( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP0]] // // AArch64-LABEL: @test_crc32cw( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP0]] // uint32_t test_crc32cw(uint32_t a, uint32_t b) { @@ -1519,13 +1519,13 @@ // AArch32-NEXT: [[TMP0:%.*]] = trunc i64 [[B:%.*]] to i32 // AArch32-NEXT: [[TMP1:%.*]] = lshr i64 [[B]], 32 // AArch32-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 -// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]] -// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[TMP3]], i32 [[TMP2]]) #[[ATTR1]] +// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]] +// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[TMP3]], i32 [[TMP2]]) [[ATTR1]] // AArch32-NEXT: ret i32 [[TMP4]] // // AArch64-LABEL: @test_crc32cd( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cx(i32 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] +// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cx(i32 [[A:%.*]], i64 [[B:%.*]]) [[ATTR3]] // AArch64-NEXT: ret i32 [[TMP0]] // uint32_t test_crc32cd(uint32_t a, uint64_t b) { @@ -1535,12 +1535,12 @@ /* 10.1 Special register intrinsics */ // AArch32-LABEL: @test_rsr( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META9:![0-9]+]]) +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32RSR32:!.*]]) // AArch32-NEXT: ret i32 [[TMP0]] // // AArch64-LABEL: @test_rsr( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8:![0-9]+]]) +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR:!.*]]) // AArch64-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 // AArch64-NEXT: ret i32 [[TMP1]] // @@ -1554,12 +1554,12 @@ // AArch32-LABEL: @test_rsr64( // AArch32-NEXT: entry: -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META10:![0-9]+]]) +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A32RSR64:!.*]]) // AArch32-NEXT: ret i64 [[TMP0]] // // AArch64-LABEL: @test_rsr64( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]]) +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]]) // AArch64-NEXT: ret i64 [[TMP0]] // uint64_t test_rsr64() { @@ -1572,13 +1572,13 @@ // AArch32-LABEL: @test_rsrp( // AArch32-NEXT: entry: -// 
AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META11:![0-9]+]]) +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32SYSREG:!.*]]) // AArch32-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to i8* // AArch32-NEXT: ret i8* [[TMP1]] // // AArch64-LABEL: @test_rsrp( // AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META9:![0-9]+]]) +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64SYSREG:!.*]]) // AArch64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to i8* // AArch64-NEXT: ret i8* [[TMP1]] // @@ -1588,13 +1588,13 @@ // AArch32-LABEL: @test_wsr( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[META9]], i32 [[V:%.*]]) +// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[A32RSR32]], i32 [[V:%.*]]) // AArch32-NEXT: ret void // // AArch64-LABEL: @test_wsr( // AArch64-NEXT: entry: // AArch64-NEXT: [[TMP0:%.*]] = zext i32 [[V:%.*]] to i64 -// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP0]]) +// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP0]]) // AArch64-NEXT: ret void // void test_wsr(uint32_t v) { @@ -1607,12 +1607,12 @@ // AArch32-LABEL: @test_wsr64( // AArch32-NEXT: entry: -// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[META10]], i64 [[V:%.*]]) +// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[A32RSR64]], i64 [[V:%.*]]) // AArch32-NEXT: ret void // // AArch64-LABEL: @test_wsr64( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[V:%.*]]) +// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[V:%.*]]) // AArch64-NEXT: ret void // void test_wsr64(uint64_t v) { @@ -1626,13 +1626,13 @@ // AArch32-LABEL: @test_wsrp( // AArch32-NEXT: entry: // AArch32-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[V:%.*]] to i32 -// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[META11]], i32 [[TMP0]]) +// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[A32SYSREG]], i32 [[TMP0]]) // AArch32-NEXT: ret void // // AArch64-LABEL: @test_wsrp( // AArch64-NEXT: entry: // AArch64-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[V:%.*]] to i64 -// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META9]], i64 [[TMP0]]) +// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64SYSREG]], i64 [[TMP0]]) // AArch64-NEXT: ret void // void test_wsrp(void *v) { @@ -1642,7 +1642,7 @@ // AArch32-LABEL: @test_rsrf( // AArch32-NEXT: entry: // AArch32-NEXT: [[REF_TMP:%.*]] = alloca i32, align 4 -// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META9]]) +// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32RSR32]]) // AArch32-NEXT: store i32 [[TMP0]], i32* [[REF_TMP]], align 4 // AArch32-NEXT: [[TMP1:%.*]] = bitcast i32* [[REF_TMP]] to float* // AArch32-NEXT: [[TMP2:%.*]] = load float, float* [[TMP1]], align 4 @@ -1651,7 +1651,7 @@ // AArch64-LABEL: @test_rsrf( // AArch64-NEXT: entry: // AArch64-NEXT: [[REF_TMP:%.*]] = alloca i32, align 4 -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]]) +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]]) // AArch64-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 // AArch64-NEXT: store i32 [[TMP1]], i32* [[REF_TMP]], align 4 // AArch64-NEXT: 
[[TMP2:%.*]] = bitcast i32* [[REF_TMP]] to float* @@ -1669,7 +1669,7 @@ // AArch32-LABEL: @test_rsrf64( // AArch32-NEXT: entry: // AArch32-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 -// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META10]]) +// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A32RSR64]]) // AArch32-NEXT: store i64 [[TMP0]], i64* [[REF_TMP]], align 8 // AArch32-NEXT: [[TMP1:%.*]] = bitcast i64* [[REF_TMP]] to double* // AArch32-NEXT: [[TMP2:%.*]] = load double, double* [[TMP1]], align 8 @@ -1678,7 +1678,7 @@ // AArch64-LABEL: @test_rsrf64( // AArch64-NEXT: entry: // AArch64-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 -// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]]) +// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]]) // AArch64-NEXT: store i64 [[TMP0]], i64* [[REF_TMP]], align 8 // AArch64-NEXT: [[TMP1:%.*]] = bitcast i64* [[REF_TMP]] to double* // AArch64-NEXT: [[TMP2:%.*]] = load double, double* [[TMP1]], align 8 @@ -1698,7 +1698,7 @@ // AArch32-NEXT: store float [[V:%.*]], float* [[V_ADDR]], align 4 // AArch32-NEXT: [[TMP0:%.*]] = bitcast float* [[V_ADDR]] to i32* // AArch32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[META9]], i32 [[TMP1]]) +// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[A32RSR32]], i32 [[TMP1]]) // AArch32-NEXT: ret void // // AArch64-LABEL: @test_wsrf( @@ -1708,7 +1708,7 @@ // AArch64-NEXT: [[TMP0:%.*]] = bitcast float* [[V_ADDR]] to i32* // AArch64-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 // AArch64-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP2]]) +// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP2]]) // AArch64-NEXT: ret void // void test_wsrf(float v) { @@ -1725,7 +1725,7 @@ // AArch32-NEXT: store double [[V:%.*]], double* [[V_ADDR]], align 8 // AArch32-NEXT: [[TMP0:%.*]] = bitcast double* [[V_ADDR]] to i64* // AArch32-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8 -// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[META10]], i64 [[TMP1]]) +// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[A32RSR64]], i64 [[TMP1]]) // AArch32-NEXT: ret void // // AArch64-LABEL: @test_wsrf64( @@ -1734,7 +1734,7 @@ // AArch64-NEXT: store double [[V:%.*]], double* [[V_ADDR]], align 8 // AArch64-NEXT: [[TMP0:%.*]] = bitcast double* [[V_ADDR]] to i64* // AArch64-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8 -// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP1]]) +// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP1]]) // AArch64-NEXT: ret void // void test_wsrf64(double v) { @@ -1748,7 +1748,7 @@ #ifdef __ARM_64BIT_STATE // AArch6483-LABEL: @test_jcvt( // AArch6483-NEXT: entry: -// AArch6483-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.fjcvtzs(double [[V:%.*]]) #[[ATTR3]] +// AArch6483-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.fjcvtzs(double [[V:%.*]]) [[ATTR3:#.*]] // AArch6483-NEXT: ret i32 [[TMP0]] // int32_t test_jcvt(double v) { @@ -1759,31 +1759,36 @@ #if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG) -// AArch64-LABEL: @test_rndr( -// AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndr() #[[ATTR3]] -// AArch64-NEXT: [[TMP1:%.*]] = extractvalue { i64, 
i1 } [[TMP0]], 0 -// AArch64-NEXT: [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1 -// AArch64-NEXT: store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8 -// AArch64-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i32 -// AArch64-NEXT: ret i32 [[TMP3]] +// AArch6485-LABEL: @test_rndr( +// AArch6485-NEXT: entry: +// AArch6485-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndr() [[ATTR3:#.*]] +// AArch6485-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0 +// AArch6485-NEXT: [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1 +// AArch6485-NEXT: store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8 +// AArch6485-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i32 +// AArch6485-NEXT: ret i32 [[TMP3]] // int test_rndr(uint64_t *__addr) { return __rndr(__addr); } -// AArch64-LABEL: @test_rndrrs( -// AArch64-NEXT: entry: -// AArch64-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndrrs() #[[ATTR3]] -// AArch64-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0 -// AArch64-NEXT: [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1 -// AArch64-NEXT: store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8 -// AArch64-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i32 -// AArch64-NEXT: ret i32 [[TMP3]] +// AArch6485-LABEL: @test_rndrrs( +// AArch6485-NEXT: entry: +// AArch6485-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndrrs() [[ATTR3:#.*]] +// AArch6485-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0 +// AArch6485-NEXT: [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1 +// AArch6485-NEXT: store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8 +// AArch6485-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i32 +// AArch6485-NEXT: ret i32 [[TMP3]] // int test_rndrrs(uint64_t *__addr) { return __rndrrs(__addr); } #endif +// AArch32: [[A32RSR32]] = !{!"cp1:2:c3:c4:5"} +// AArch32: [[A32RSR64]] = !{!"cp1:2:c3"} +// AArch32: [[A32SYSREG]] = !{!"sysreg"} +// AArch64: [[A64RSR]] = !{!"1:2:3:4:5"} +// AArch64: [[A64SYSREG]] = !{!"sysreg"} Index: clang/test/CodeGen/memcpy-inline-builtin.c =================================================================== --- clang/test/CodeGen/memcpy-inline-builtin.c +++ clang/test/CodeGen/memcpy-inline-builtin.c @@ -32,11 +32,11 @@ // CHECK-NEXT: store i8* [[TMP0]], i8** [[A_ADDR_I]], align 8 // CHECK-NEXT: store i8* [[TMP1]], i8** [[B_ADDR_I]], align 8 // CHECK-NEXT: store i64 [[TMP2]], i64* [[C_ADDR_I]], align 8 -// CHECK-NEXT: call void asm sideeffect "# memcpy.inline marker", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc !2 +// CHECK-NEXT: call void asm sideeffect "# memcpy.inline marker", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR4:[0-9]+]], !srcloc !2 // CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[A_ADDR_I]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i8*, i8** [[B_ADDR_I]], align 8 // CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[C_ADDR_I]], align 8 -// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false) #[[ATTR3]] +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false) #[[ATTR4]] // CHECK-NEXT: ret i8* [[TMP3]] // void *foo(void *a, const void *b, size_t c) { Index: clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp =================================================================== --- clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp +++ clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp @@ -649,25 +649,25 @@ // CHECK1-NEXT: store i32 [[CONV3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias 
!14 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK1: omp.inner.for.cond.i: -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]] +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: [[CONV4_I:%.*]] = sext i32 [[TMP46]] to i64 -// CHECK1-NEXT: [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV4_I]], [[TMP47]] // CHECK1-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__9_EXIT:%.*]] // CHECK1: omp.inner.for.body.i: -// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] -// CHECK1-NEXT: store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] -// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 +// CHECK1-NEXT: store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group !15 +// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15 // CHECK1-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP49]] to i64 // CHECK1-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, i16* [[CONV2_I]], i64 [[IDXPROM_I]] -// CHECK1-NEXT: [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group !15 // CHECK1-NEXT: [[CONV5_I:%.*]] = sext i16 [[TMP50]] to i32 -// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15 // CHECK1-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP51]], [[CONV5_I]] -// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group !15 +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1 -// CHECK1-NEXT: store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]] // CHECK1: .omp_outlined..9.exit: // CHECK1-NEXT: ret i32 0 @@ -761,31 +761,31 @@ // CHECK3-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]] +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[CONV2:%.*]] = sext i32 [[TMP4]] to i64 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: 
[[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group !2 // CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP5]] // CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group !2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP7]] to i64 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group !2 // CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP8]] to i32 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[CONV3]] -// CHECK3-NEXT: store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group !2 // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK3-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: store i32 5, i32* [[I]], align 4 Index: clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp =================================================================== --- clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp +++ clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp @@ -449,7 +449,7 @@ // CHECK1-NEXT: [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) // CHECK1-NEXT: [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1 // CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) -// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12 +// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8 // CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8 // CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]] // CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2 Index: 
clang/test/OpenMP/sections_reduction_task_codegen.cpp =================================================================== --- clang/test/OpenMP/sections_reduction_task_codegen.cpp +++ clang/test/OpenMP/sections_reduction_task_codegen.cpp @@ -487,7 +487,7 @@ // CHECK1-NEXT: [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) // CHECK1-NEXT: [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1 // CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) -// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12 +// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8 // CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8 // CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]] // CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2 Index: clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp =================================================================== --- clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp +++ clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp @@ -453,7 +453,7 @@ // CHECK1-NEXT: [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) // CHECK1-NEXT: [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1 // CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) -// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12 +// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8 // CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8 // CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]] // CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2 Index: clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp =================================================================== --- clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp +++ clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -853,7 +853,7 @@ // CHECK1-NEXT: [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) // CHECK1-NEXT: [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1 // CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64) -// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12 +// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8 // CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8 // CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]] // CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2 Index: clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp =================================================================== --- clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp +++ clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp @@ -642,25 +642,25 @@ // CHECK1-NEXT: store i32 [[CONV3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias 
!14 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK1: omp.inner.for.cond.i: -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]] +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: [[CONV4_I:%.*]] = sext i32 [[TMP46]] to i64 -// CHECK1-NEXT: [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV4_I]], [[TMP47]] // CHECK1-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__9_EXIT:%.*]] // CHECK1: omp.inner.for.body.i: -// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] -// CHECK1-NEXT: store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] -// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 +// CHECK1-NEXT: store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group !15 +// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15 // CHECK1-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP49]] to i64 // CHECK1-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, i16* [[CONV2_I]], i64 [[IDXPROM_I]] -// CHECK1-NEXT: [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group !15 // CHECK1-NEXT: [[CONV5_I:%.*]] = sext i16 [[TMP50]] to i32 -// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15 // CHECK1-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP51]], [[CONV5_I]] -// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group !15 +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1 -// CHECK1-NEXT: store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK1-NEXT: store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]] // CHECK1: .omp_outlined..9.exit: // CHECK1-NEXT: ret i32 0 @@ -754,31 +754,31 @@ // CHECK3-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]] +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[CONV2:%.*]] = sext i32 [[TMP4]] to i64 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: 
[[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group !2 // CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP5]] // CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group !2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP7]] to i64 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group !2 // CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP8]] to i32 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[CONV3]] -// CHECK3-NEXT: store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group !2 // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK3-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: store i32 5, i32* [[I]], align 4 Index: clang/unittests/Format/TokenAnnotatorTest.cpp =================================================================== --- clang/unittests/Format/TokenAnnotatorTest.cpp +++ clang/unittests/Format/TokenAnnotatorTest.cpp @@ -549,323 +549,171 @@ } TEST_F(TokenAnnotatorTest, RequiresDoesNotChangeParsingOfTheRest) { - auto NumberOfAdditionalRequiresClauseTokens = 5u; - auto NumberOfTokensBeforeRequires = 5u; - - auto BaseTokens = annotate("template\n" - "T Pi = 3.14;"); - auto ConstrainedTokens = annotate("template\n" - " requires Foo\n" - "T Pi = 3.14;"); - - auto NumberOfBaseTokens = 11u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; + const 
char *BaseCode = nullptr; + const char *ConstrainedCode = nullptr; + auto BaseTokenCount = 0u; + auto RequiresTokenCount = 0u; + auto PrefixTokenCount = 0u; + + auto TestRequires = [&](int Line) { + const auto BaseTokens = annotate(BaseCode); + const auto ConstrainedTokens = annotate(ConstrainedCode); + +#define LINE " (Line " << Line << ')' + + ASSERT_EQ(BaseTokens.size(), BaseTokenCount) << BaseTokens << LINE; + ASSERT_EQ(ConstrainedTokens.size(), BaseTokenCount + RequiresTokenCount) + << LINE; + + for (auto I = 0u; I < BaseTokenCount; ++I) { + EXPECT_EQ( + *BaseTokens[I], + *ConstrainedTokens[I < PrefixTokenCount ? I : I + RequiresTokenCount]) + << I << LINE; } - } - BaseTokens = annotate("template\n" - "struct Bar;"); - ConstrainedTokens = annotate("template\n" - " requires Foo\n" - "struct Bar;"); - NumberOfBaseTokens = 9u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("template\n" - "struct Bar {" - " T foo();\n" - " T bar();\n" - "};"); - ConstrainedTokens = annotate("template\n" - " requires Foo\n" - "struct Bar {" - " T foo();\n" - " T bar();\n" - "};"); - NumberOfBaseTokens = 21u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("template\n" - "Bar(T) -> Bar;"); - ConstrainedTokens = annotate("template\n" - " requires Foo\n" - "Bar(T) -> Bar;"); - NumberOfBaseTokens = 16u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("template\n" - "T foo();"); - ConstrainedTokens = annotate("template\n" - " requires Foo\n" - "T foo();"); - NumberOfBaseTokens = 11u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("template\n" - "T foo() {\n" - " auto bar = baz();\n" - " return bar + T{};\n" - "}"); - ConstrainedTokens = annotate("template\n" - " requires Foo\n" - "T foo() {\n" - " auto bar = baz();\n" - " return bar + T{};\n" - "}"); - 
NumberOfBaseTokens = 26u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("template\n" - "T foo();"); - ConstrainedTokens = annotate("template\n" - "T foo() requires Foo;"); - NumberOfBaseTokens = 11u; - NumberOfTokensBeforeRequires = 9u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("template\n" - "T foo() {\n" - " auto bar = baz();\n" - " return bar + T{};\n" - "}"); - ConstrainedTokens = annotate("template\n" - "T foo() requires Foo {\n" - " auto bar = baz();\n" - " return bar + T{};\n" - "}"); - NumberOfBaseTokens = 26u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("template\n" - "Bar(T) -> Bar;"); - ConstrainedTokens = annotate("template\n" - " requires requires(T &&t) {\n" - " typename T::I;\n" - " }\n" - "Bar(T) -> Bar;"); - NumberOfBaseTokens = 19u; - NumberOfAdditionalRequiresClauseTokens = 14u; - NumberOfTokensBeforeRequires = 5u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("struct [[nodiscard]] zero_t {\n" - " template\n" - " [[nodiscard]] constexpr operator T() const { " - "return number_zero_v; }\n" - "};"); - - ConstrainedTokens = annotate("struct [[nodiscard]] zero_t {\n" - " template\n" - " requires requires { number_zero_v; }\n" - " [[nodiscard]] constexpr operator T() const { " - "return number_zero_v; }\n" - "};"); - NumberOfBaseTokens = 35u; - NumberOfAdditionalRequiresClauseTokens = 9u; - NumberOfTokensBeforeRequires = 13u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - 
*ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("constexpr Foo(Foo const &other)\n" - " : value{other.value} {\n" - " do_magic();\n" - " do_more_magic();\n" - "}"); - - ConstrainedTokens = annotate("constexpr Foo(Foo const &other)\n" - " requires std::is_copy_constructible\n" - " : value{other.value} {\n" - " do_magic();\n" - " do_more_magic();\n" - "}"); - - NumberOfBaseTokens = 26u; - NumberOfAdditionalRequiresClauseTokens = 7u; - NumberOfTokensBeforeRequires = 8u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } - - BaseTokens = annotate("constexpr Foo(Foo const &other)\n" - " : value{other.value} {\n" - " do_magic();\n" - " do_more_magic();\n" - "}"); - - ConstrainedTokens = annotate("constexpr Foo(Foo const &other)\n" - " requires (std::is_copy_constructible)\n" - " : value{other.value} {\n" - " do_magic();\n" - " do_more_magic();\n" - "}"); - - NumberOfBaseTokens = 26u; - NumberOfAdditionalRequiresClauseTokens = 9u; - NumberOfTokensBeforeRequires = 8u; - - ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; - ASSERT_EQ(ConstrainedTokens.size(), - NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) - << ConstrainedTokens; - - for (auto I = 0u; I < NumberOfBaseTokens; ++I) { - if (I < NumberOfTokensBeforeRequires) { - EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; - } else { - EXPECT_EQ(*BaseTokens[I], - *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) - << I; - } - } +#undef LINE + }; + + BaseCode = "template\n" + "T Pi = 3.14;"; + ConstrainedCode = "template\n" + " requires Foo\n" + "T Pi = 3.14;"; + BaseTokenCount = 11; + RequiresTokenCount = 5; + PrefixTokenCount = 5; + TestRequires(__LINE__); + + BaseCode = "template\n" + "struct Bar;"; + ConstrainedCode = "template\n" + " requires Foo\n" + "struct Bar;"; + BaseTokenCount = 9; + TestRequires(__LINE__); + + BaseCode = "template\n" + "struct Bar {\n" + " T foo();\n" + " T bar();\n" + "};"; + ConstrainedCode = "template\n" + " requires Foo\n" + "struct Bar {\n" + " T foo();\n" + " T bar();\n" + "};"; + BaseTokenCount = 21; + TestRequires(__LINE__); + + BaseCode = "template\n" + "Bar(T) -> Bar;"; + ConstrainedCode = "template\n" + " requires Foo\n" + "Bar(T) -> Bar;"; + BaseTokenCount = 16; + TestRequires(__LINE__); + + BaseCode = "template\n" + "T foo();"; + ConstrainedCode = "template\n" + " requires Foo\n" + "T foo();"; + BaseTokenCount = 11; + TestRequires(__LINE__); + + BaseCode = "template\n" + "T foo() {\n" + " auto bar = baz();\n" + " return bar + T{};\n" + "}"; + ConstrainedCode = "template\n" + " requires Foo\n" + "T foo() {\n" + " auto bar = baz();\n" + " return bar + T{};\n" + "}"; + BaseTokenCount = 26; + TestRequires(__LINE__); + + BaseCode = "template\n" + "T foo();"; + ConstrainedCode = "template\n" + "T foo() requires Foo;"; + BaseTokenCount = 11; + PrefixTokenCount = 9; + TestRequires(__LINE__); + + BaseCode = "template\n" + "T foo() {\n" + " auto bar = baz();\n" + " return bar + T{};\n" + "}"; + ConstrainedCode = "template\n" + "T foo() requires Foo {\n" + " auto bar = baz();\n" + " return bar + T{};\n" + 
"}"; + BaseTokenCount = 26; + TestRequires(__LINE__); + + BaseCode = "template\n" + "Bar(T) -> Bar;"; + ConstrainedCode = "template\n" + " requires requires(T &&t) {\n" + " typename T::I;\n" + " }\n" + "Bar(T) -> Bar;"; + BaseTokenCount = 19; + RequiresTokenCount = 14; + PrefixTokenCount = 5; + TestRequires(__LINE__); + + BaseCode = "struct [[nodiscard]] zero_t {\n" + " template\n" + " [[nodiscard]] constexpr operator T() const { return v; }\n" + "};"; + ConstrainedCode = + "struct [[nodiscard]] zero_t {\n" + " template\n" + " requires requires { v; }\n" + " [[nodiscard]] constexpr operator T() const { return v; }\n" + "};"; + BaseTokenCount = 35; + RequiresTokenCount = 9; + PrefixTokenCount = 13; + TestRequires(__LINE__); + + BaseCode = "constexpr Foo(Foo const &other)\n" + " : value{other.value} {\n" + " do_magic();\n" + " do_more_magic();\n" + "}"; + ConstrainedCode = "constexpr Foo(Foo const &other)\n" + " requires std::is_copy_constructible\n" + " : value{other.value} {\n" + " do_magic();\n" + " do_more_magic();\n" + "}"; + BaseTokenCount = 26; + RequiresTokenCount = 7; + PrefixTokenCount = 8; + TestRequires(__LINE__); + + BaseCode = "constexpr Foo(Foo const &other)\n" + " : value{other.value} {\n" + " do_magic();\n" + " do_more_magic();\n" + "}"; + ConstrainedCode = "constexpr Foo(Foo const &other)\n" + " requires (std::is_copy_constructible)\n" + " : value{other.value} {\n" + " do_magic();\n" + " do_more_magic();\n" + "}"; + RequiresTokenCount = 9; + TestRequires(__LINE__); } TEST_F(TokenAnnotatorTest, UnderstandsAsm) { Index: compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp =================================================================== --- compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -72,6 +72,7 @@ // symbolication. static void InitializeSwiftDemangler() { swift_demangle_f = (swift_demangle_ft)dlsym(RTLD_DEFAULT, "swift_demangle"); + (void)dlerror(); // Cleanup error message in case of failure } // Attempts to demangle a Swift name. The demangler will return nullptr if a Index: llvm/include/llvm/Analysis/LoopInfo.h =================================================================== --- llvm/include/llvm/Analysis/LoopInfo.h +++ llvm/include/llvm/Analysis/LoopInfo.h @@ -814,15 +814,12 @@ /// by one each time through the loop. bool isCanonical(ScalarEvolution &SE) const; - /// Return true if the Loop is in LCSSA form. If \p IgnoreTokens is set to - /// true, token values defined inside loop are allowed to violate LCSSA form. - bool isLCSSAForm(const DominatorTree &DT, bool IgnoreTokens = true) const; - - /// Return true if this Loop and all inner subloops are in LCSSA form. If \p - /// IgnoreTokens is set to true, token values defined inside loop are allowed - /// to violate LCSSA form. - bool isRecursivelyLCSSAForm(const DominatorTree &DT, const LoopInfo &LI, - bool IgnoreTokens = true) const; + /// Return true if the Loop is in LCSSA form. + bool isLCSSAForm(const DominatorTree &DT) const; + + /// Return true if this Loop and all inner subloops are in LCSSA form. + bool isRecursivelyLCSSAForm(const DominatorTree &DT, + const LoopInfo &LI) const; /// Return true if the Loop is in the form that the LoopSimplify form /// transforms loops to, which is sometimes called normal form. 
Index: llvm/include/llvm/Support/JSON.h =================================================================== --- llvm/include/llvm/Support/JSON.h +++ llvm/include/llvm/Support/JSON.h @@ -169,36 +169,44 @@ emplace_back(V); } - Value &operator[](size_t I); - const Value &operator[](size_t I) const; - Value &front(); - const Value &front() const; - Value &back(); - const Value &back() const; - Value *data(); - const Value *data() const; - - iterator begin(); - const_iterator begin() const; - iterator end(); - const_iterator end() const; - - bool empty() const; - size_t size() const; - void reserve(size_t S); - - void clear(); - void push_back(const Value &E); - void push_back(Value &&E); - template <typename... Args> void emplace_back(Args &&...A); - void pop_back(); + Value &operator[](size_t I) { return V[I]; } + const Value &operator[](size_t I) const { return V[I]; } + Value &front() { return V.front(); } + const Value &front() const { return V.front(); } + Value &back() { return V.back(); } + const Value &back() const { return V.back(); } + Value *data() { return V.data(); } + const Value *data() const { return V.data(); } + + iterator begin() { return V.begin(); } + const_iterator begin() const { return V.begin(); } + iterator end() { return V.end(); } + const_iterator end() const { return V.end(); } + + bool empty() const { return V.empty(); } + size_t size() const { return V.size(); } + void reserve(size_t S) { V.reserve(S); } + + void clear() { V.clear(); } + void push_back(const Value &E) { V.push_back(E); } + void push_back(Value &&E) { V.push_back(std::move(E)); } + template <typename... Args> void emplace_back(Args &&... A) { + V.emplace_back(std::forward<Args>(A)...); + } + void pop_back() { V.pop_back(); } // FIXME: insert() takes const_iterator since C++11, old libstdc++ disagrees. - iterator insert(iterator P, const Value &E); - iterator insert(iterator P, Value &&E); - template <typename It> iterator insert(iterator P, It A, It Z); - template <typename... Args> iterator emplace(const_iterator P, Args &&...A); + iterator insert(iterator P, const Value &E) { return V.insert(P, E); } + iterator insert(iterator P, Value &&E) { + return V.insert(P, std::move(E)); + } + template <typename It> iterator insert(iterator P, It A, It Z) { + return V.insert(P, A, Z); + } + template <typename... Args> iterator emplace(const_iterator P, Args &&... A) { + return V.emplace(P, std::forward<Args>(A)...); + } - friend bool operator==(const Array &L, const Array &R); + friend bool operator==(const Array &L, const Array &R) { return L.V == R.V; } }; inline bool operator!=(const Array &L, const Array &R) { return !(L == R); } @@ -507,48 +515,6 @@ bool operator==(const Value &, const Value &); inline bool operator!=(const Value &L, const Value &R) { return !(L == R); } -// Array Methods -inline Value &Array::operator[](size_t I) { return V[I]; } -inline const Value &Array::operator[](size_t I) const { return V[I]; } -inline Value &Array::front() { return V.front(); } -inline const Value &Array::front() const { return V.front(); } -inline Value &Array::back() { return V.back(); } -inline const Value &Array::back() const { return V.back(); } -inline Value *Array::data() { return V.data(); } -inline const Value *Array::data() const { return V.data(); } - -inline typename Array::iterator Array::begin() { return V.begin(); } -inline typename Array::const_iterator Array::begin() const { return V.begin(); } -inline typename Array::iterator Array::end() { return V.end(); } -inline typename Array::const_iterator Array::end() const { return V.end(); } - -inline bool Array::empty() const { return V.empty(); } -inline size_t Array::size() const { return V.size(); } -inline void Array::reserve(size_t S) { V.reserve(S); } - -inline void Array::clear() { V.clear(); } -inline void Array::push_back(const Value &E) { V.push_back(E); } -inline void Array::push_back(Value &&E) { V.push_back(std::move(E)); } -template <typename... Args> inline void Array::emplace_back(Args &&...A) { - V.emplace_back(std::forward<Args>(A)...); -} -inline void Array::pop_back() { V.pop_back(); } -inline typename Array::iterator Array::insert(iterator P, const Value &E) { - return V.insert(P, E); -} -inline typename Array::iterator Array::insert(iterator P, Value &&E) { - return V.insert(P, std::move(E)); -} -template <typename It> -inline typename Array::iterator Array::insert(iterator P, It A, It Z) { - return V.insert(P, A, Z); -} -template <typename... Args> -inline typename Array::iterator Array::emplace(const_iterator P, Args &&...A) { - return V.emplace(P, std::forward<Args>(A)...); -} -inline bool operator==(const Array &L, const Array &R) { return L.V == R.V; } - /// ObjectKey is a used to capture keys in Object. Like Value but: /// - only strings are allowed /// - it's optimized for the string literal case (Owned == nullptr) Index: llvm/lib/Analysis/LoopInfo.cpp =================================================================== --- llvm/lib/Analysis/LoopInfo.cpp +++ llvm/lib/Analysis/LoopInfo.cpp @@ -425,12 +425,12 @@ // Check that 'BB' doesn't have any uses outside of the 'L' static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB, - const DominatorTree &DT, bool IgnoreTokens) { + const DominatorTree &DT) { for (const Instruction &I : BB) { // Tokens can't be used in PHI nodes and live-out tokens prevent loop // optimizations, so for the purposes of considered LCSSA form, we // can ignore them. - if (IgnoreTokens && I.getType()->isTokenTy()) + if (I.getType()->isTokenTy()) continue; for (const Use &U : I.uses()) { @@ -455,20 +455,20 @@ return true; } -bool Loop::isLCSSAForm(const DominatorTree &DT, bool IgnoreTokens) const { +bool Loop::isLCSSAForm(const DominatorTree &DT) const { // For each block we check that it doesn't have any uses outside of this loop. 
return all_of(this->blocks(), [&](const BasicBlock *BB) { - return isBlockInLCSSAForm(*this, *BB, DT, IgnoreTokens); + return isBlockInLCSSAForm(*this, *BB, DT); }); } -bool Loop::isRecursivelyLCSSAForm(const DominatorTree &DT, const LoopInfo &LI, - bool IgnoreTokens) const { +bool Loop::isRecursivelyLCSSAForm(const DominatorTree &DT, + const LoopInfo &LI) const { // For each block we check that it doesn't have any uses outside of its // innermost loop. This process will transitively guarantee that the current // loop and all of the nested loops are in LCSSA form. return all_of(this->blocks(), [&](const BasicBlock *BB) { - return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT, IgnoreTokens); + return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT); }); } Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23466,10 +23466,14 @@ int Index0, Index1; SDValue Src0 = DAG.getSplatSourceVector(N0, Index0); SDValue Src1 = DAG.getSplatSourceVector(N1, Index1); + // Extract element from splat_vector should be free. + // TODO: use DAG.isSplatValue instead? + bool IsBothSplatVector = N0.getOpcode() == ISD::SPLAT_VECTOR && + N1.getOpcode() == ISD::SPLAT_VECTOR; if (!Src0 || !Src1 || Index0 != Index1 || Src0.getValueType().getVectorElementType() != EltVT || Src1.getValueType().getVectorElementType() != EltVT || - !TLI.isExtractVecEltCheap(VT, Index0) || + !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) || !TLI.isOperationLegalOrCustom(Opcode, EltVT)) return SDValue(); @@ -23491,6 +23495,8 @@ } // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index + if (VT.isScalableVector()) + return DAG.getSplatVector(VT, DL, ScalarBO); SmallVector Ops(VT.getVectorNumElements(), ScalarBO); return DAG.getBuildVector(VT, DL, Ops); } Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1563,10 +1563,8 @@ setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); } - for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { + for (unsigned Opc : { ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV }) setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); - setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); - } setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); @@ -1877,10 +1875,8 @@ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); - for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { + for (unsigned Opc : { ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV }) setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); - setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); - } for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); Index: llvm/test/CodeGen/RISCV/rvv/vadd-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/vadd-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/vadd-sdnode.ll @@ -879,9 +879,9 @@ ; ; RV64-LABEL: vadd_xx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu 
+; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vadd.vx v8, v8, a1 ; RV64-NEXT: ret %head1 = insertelement <vscale x 8 x i64> poison, i64 %a, i32 0 %splat1 = shufflevector <vscale x 8 x i64> %head1, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer Index: llvm/test/CodeGen/RISCV/rvv/vand-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/vand-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/vand-sdnode.ll @@ -1370,9 +1370,9 @@ ; ; RV64-LABEL: vand_xx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: ret %head1 = insertelement <vscale x 8 x i64> poison, i64 %a, i32 0 %splat1 = shufflevector <vscale x 8 x i64> %head1, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer Index: llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64NOM ; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64M define <vscale x 1 x i8> @vmul_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb) { ; CHECK-LABEL: vmul_vv_nxv1i8: @@ -939,12 +939,19 @@ ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: vmul_xx_nxv8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: ret +; RV64NOM-LABEL: vmul_xx_nxv8i64: +; RV64NOM: # %bb.0: +; RV64NOM-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64NOM-NEXT: vmv.v.x v8, a0 +; RV64NOM-NEXT: vmul.vx v8, v8, a1 +; RV64NOM-NEXT: ret +; +; RV64M-LABEL: vmul_xx_nxv8i64: +; RV64M: # %bb.0: +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64M-NEXT: vmv.v.x v8, a0 +; RV64M-NEXT: ret %head1 = insertelement <vscale x 8 x i64> poison, i64 %a, i32 0 %splat1 = shufflevector <vscale x 8 x i64> %head1, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer %head2 = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0 Index: llvm/test/CodeGen/RISCV/rvv/vor-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/vor-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/vor-sdnode.ll @@ -1163,9 +1163,9 @@ ; ; RV64-LABEL: vor_xx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vor.vx v8, v8, a1 ; RV64-NEXT: ret %head1 = insertelement <vscale x 8 x i64> poison, i64 %a, i32 0 %splat1 = shufflevector <vscale x 8 x i64> %head1, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer Index: llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll @@ -857,9 +857,9 @@ ; ; RV64-LABEL: vsub_xx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, 
zero, e64, m8, ta, mu +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vsub.vx v8, v8, a1 ; RV64-NEXT: ret %head1 = insertelement poison, i64 %a, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer Index: llvm/test/CodeGen/RISCV/rvv/vxor-sdnode.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/vxor-sdnode.ll +++ llvm/test/CodeGen/RISCV/rvv/vxor-sdnode.ll @@ -1370,9 +1370,9 @@ ; ; RV64-LABEL: vxor_xx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vxor.vx v8, v8, a1 ; RV64-NEXT: ret %head1 = insertelement poison, i64 %a, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer Index: llvm/test/CodeGen/X86/avx512-f16c-v16f16-fadd.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-f16c-v16f16-fadd.ll +++ /dev/null @@ -1,27 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=f16c| FileCheck %s --check-prefixes=CHECK - -define <16 x half> @foo(<16 x half> %a, <16 x half> %b) nounwind { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: andq $-32, %rsp -; CHECK-NEXT: subq $96, %rsp -; CHECK-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm0, (%rsp) -; CHECK-NEXT: vcvtph2ps {{[0-9]+}}(%rsp), %ymm0 -; CHECK-NEXT: vcvtph2ps (%rsp), %ymm1 -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NEXT: vcvtph2ps {{[0-9]+}}(%rsp), %ymm1 -; CHECK-NEXT: vcvtph2ps {{[0-9]+}}(%rsp), %ymm2 -; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; CHECK-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: retq - %1 = fadd <16 x half> %a, %b - ret <16 x half> %1 -} Index: llvm/test/CodeGen/X86/avx512-skx-v32f16-fadd.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-skx-v32f16-fadd.ll +++ /dev/null @@ -1,27 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f| FileCheck %s --check-prefixes=CHECK - -define <32 x half> @foo(<32 x half> %a, <32 x half> %b) nounwind { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: andq $-64, %rsp -; CHECK-NEXT: subq $192, %rsp -; CHECK-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %zmm0, (%rsp) -; CHECK-NEXT: vcvtph2ps {{[0-9]+}}(%rsp), %zmm0 -; CHECK-NEXT: vcvtph2ps (%rsp), %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; CHECK-NEXT: vcvtph2ps {{[0-9]+}}(%rsp), %zmm1 -; CHECK-NEXT: vcvtph2ps {{[0-9]+}}(%rsp), %zmm2 -; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 -; CHECK-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: retq - %1 = fadd <32 x half> %a, %b - ret <32 x half> %1 -} Index: llvm/unittests/Analysis/LoopInfoTest.cpp =================================================================== --- llvm/unittests/Analysis/LoopInfoTest.cpp +++ llvm/unittests/Analysis/LoopInfoTest.cpp @@ -1584,66 +1584,3 @@ 
EXPECT_EQ(L->getInductionVariable(SE)->getName(), "count.07"); }); } - -// Test that we correctly identify tokens breaching LCSSA form. -TEST(LoopInfoTest, TokenLCSSA) { - const char *ModuleStr = - "define void @test() gc \"statepoint-example\" {\n" - "entry:\n" - " br label %outer_loop\n" - "outer_loop:\n" - " br label %inner_loop\n" - "inner_loop:\n" - " %token = call token (i64, i32, i8 addrspace(1)* (i64, i32, i32, " - "i32)*, i32, i32, ...) " - "@llvm.experimental.gc.statepoint.p0f_p1i8i64i32i32i32f(i64 2882400000, " - "i32 0, i8 addrspace(1)* (i64, i32, i32, i32)* nonnull elementtype(i8 " - "addrspace(1)* (i64, i32, i32, i32)) @foo, i32 4, i32 0, i64 undef, i32 " - "5, i32 5, i32 undef, i32 0, i32 0) [ \"deopt\"(), \"gc-live\"(i8 " - "addrspace(1)* undef) ]\n" - " br i1 undef, label %inner_loop, label %outer_backedge\n" - "outer_backedge:\n" - " br i1 undef, label %outer_loop, label %exit\n" - "exit:\n" - " %tmp35 = call coldcc i8 addrspace(1)* " - "@llvm.experimental.gc.relocate.p1i8(token %token, i32 0, i32 0) ; " - "(undef, undef)\n" - " ret void\n" - "}\n" - "declare i8 addrspace(1)* @foo(i64, i32, i32, i32)\n" - "declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32 " - "immarg, i32 immarg) #0\n" - "declare token " - "@llvm.experimental.gc.statepoint.p0f_p1i8i64i32i32i32f(i64 immarg, i32 " - "immarg, i8 addrspace(1)* (i64, i32, i32, i32)*, i32 immarg, i32 immarg, " - "...)\n" - "attributes #0 = { nounwind readnone }\n"; - - // Parse the module. - LLVMContext Context; - std::unique_ptr M = makeLLVMModule(Context, ModuleStr); - - runWithLoopInfoPlus(*M, "test", - [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) { - Function::iterator FI = F.begin(); - BasicBlock *OuterHeader = &*(++FI); - Loop *OuterLoop = LI.getLoopFor(OuterHeader); - BasicBlock *InnerHeader = &*(++FI); - Loop *InnerLoop = LI.getLoopFor(InnerHeader); - EXPECT_NE(OuterLoop, nullptr); - EXPECT_NE(InnerLoop, nullptr); - DominatorTree DT(F); - EXPECT_TRUE(OuterLoop->isLCSSAForm(DT, /*IgnoreTokens*/ true)); - EXPECT_FALSE(OuterLoop->isLCSSAForm(DT, /*IgnoreTokens*/ false)); - EXPECT_TRUE(InnerLoop->isLCSSAForm(DT, /*IgnoreTokens*/ true)); - EXPECT_FALSE(InnerLoop->isLCSSAForm(DT, /*IgnoreTokens*/ false)); - EXPECT_TRUE( - OuterLoop->isRecursivelyLCSSAForm(DT, LI, /*IgnoreTokens*/ true)); - EXPECT_FALSE( - OuterLoop->isRecursivelyLCSSAForm(DT, LI, /*IgnoreTokens*/ false)); - EXPECT_TRUE( - InnerLoop->isRecursivelyLCSSAForm(DT, LI, /*IgnoreTokens*/ true)); - EXPECT_FALSE( - InnerLoop->isRecursivelyLCSSAForm(DT, LI, /*IgnoreTokens*/ false)); - }); -} Index: llvm/utils/gn/secondary/libcxx/include/BUILD.gn =================================================================== --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -169,7 +169,6 @@ "__algorithm/ranges_is_partitioned.h", "__algorithm/ranges_is_sorted.h", "__algorithm/ranges_is_sorted_until.h", - "__algorithm/ranges_iterator_concept.h", "__algorithm/ranges_lexicographical_compare.h", "__algorithm/ranges_lower_bound.h", "__algorithm/ranges_make_heap.h",