Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AArch64: Add NEON fp16 intrinsics #1726

Merged
merged 6 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/core_arch/src/aarch64/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! AArch64 intrinsics.
//!
//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The
//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
//! The reference for NEON is [Arm's NEON Intrinsics Reference][arm_ref]. The
//! [Arm's NEON Intrinsics Online Database][arm_dat] is also useful.
//!
//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
Expand Down
4,350 changes: 4,242 additions & 108 deletions crates/core_arch/src/aarch64/neon/generated.rs

Large diffs are not rendered by default.

6,480 changes: 6,208 additions & 272 deletions crates/core_arch/src/arm_shared/neon/generated.rs

Large diffs are not rendered by default.

282 changes: 182 additions & 100 deletions crates/core_arch/src/arm_shared/neon/mod.rs

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions crates/core_arch/src/simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ simd_ty!(
simd_ty!(i32x4[i32;4]: x0, x1, x2, x3);
simd_ty!(i64x2[i64;2]: x0, x1);

simd_ty!(f16x4[f16;4]: x0, x1, x2, x3);

simd_ty!(
f16x8[f16;8]:
x0,
Expand Down
8 changes: 7 additions & 1 deletion crates/intrinsic-test/missing_aarch64.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,10 @@ vrnd64zq_f64
vrnd32x_f64
vrnd32z_f64
vrnd64x_f64
vrnd64z_f64
vrnd64z_f64

# Broken in Clang
vcvth_s16_f16
# FIXME: Broken output due to missing f16 printing support in Rust, see git blame for this line
vmulh_lane_f16
vmulh_laneq_f16
137 changes: 136 additions & 1 deletion crates/intrinsic-test/missing_arm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,74 @@ vsri_n_p64
vsriq_n_p64
vtst_p64
vtstq_p64
vaddh_f16
vsubh_f16
vabsh_f16
vdivh_f16
vmulh_f16
vfmsh_f16
vfmah_f16
vminnmh_f16
vmaxnmh_f16
vrndh_f16
vrndnh_f16
vrndih_f16
vrndah_f16
vrndph_f16
vrndmh_f16
vrndxh_f16
vsqrth_f16
vnegh_f16
vcvth_f16_s32
vcvth_s32_f16
vcvth_n_f16_s32
vcvth_n_s32_f16
vcvth_f16_u32
vcvth_u32_f16
vcvth_n_f16_u32
vcvth_n_u32_f16
vcvtah_s32_f16
vcvtah_u32_f16
vcvtmh_s32_f16
vcvtmh_u32_f16
vcvtpq_s16_f16
vcvtpq_u16_f16
vcvtp_s16_f16
vcvtp_u16_f16
vcvtph_s32_f16
vcvtph_u32_f16
vcvtnh_u32_f16
vcvtnh_s32_f16
vfmlsl_low_f16
vfmlslq_low_f16
vfmlsl_high_f16
vfmlslq_high_f16
vfmlsl_lane_high_f16
vfmlsl_laneq_high_f16
vfmlslq_lane_high_f16
vfmlslq_laneq_high_f16
vfmlsl_lane_low_f16
vfmlsl_laneq_low_f16
vfmlslq_lane_low_f16
vfmlslq_laneq_low_f16
vfmlal_low_f16
vfmlalq_low_f16
vfmlal_high_f16
vfmlalq_high_f16
vfmlal_lane_low_f16
vfmlal_laneq_low_f16
vfmlalq_lane_low_f16
vfmlalq_laneq_low_f16
vfmlal_lane_high_f16
vfmlal_laneq_high_f16
vfmlalq_lane_high_f16
vfmlalq_laneq_high_f16
vreinterpret_f16_p64
vreinterpretq_f16_p64
vreinterpret_p64_f16
vreinterpretq_p64_f16
vreinterpret_p128_f16
vreinterpretq_p128_f16

# Present in Clang header but triggers an ICE due to lack of backend support.
vcmla_f32
Expand All @@ -134,6 +202,31 @@ vcmlaq_rot270_laneq_f32
vcmlaq_rot90_f32
vcmlaq_rot90_lane_f32
vcmlaq_rot90_laneq_f32
vcmla_f16
vcmlaq_f16
vcmla_laneq_f16
vcmla_lane_f16
vcmla_laneq_f16
vcmlaq_lane_f16
vcmlaq_laneq_f16
vcmla_rot90_f16
vcmlaq_rot90_f16
vcmla_rot180_f16
vcmlaq_rot180_f16
vcmla_rot270_f16
vcmlaq_rot270_f16
vcmla_rot90_lane_f16
vcmla_rot90_laneq_f16
vcmlaq_rot90_lane_f16
vcmlaq_rot90_laneq_f16
vcmla_rot180_lane_f16
vcmla_rot180_laneq_f16
vcmlaq_rot180_lane_f16
vcmlaq_rot180_laneq_f16
vcmla_rot270_lane_f16
vcmla_rot270_laneq_f16
vcmlaq_rot270_lane_f16
vcmlaq_rot270_laneq_f16

# Implemented in stdarch for A64 only, Clang support both A32/A64
vadd_s64
Expand Down Expand Up @@ -182,4 +275,46 @@ vrndpq_f32
vrndq_f32
vrndq_f32
vrndx_f32
vrndxq_f32
vrndxq_f32
vrnda_f16
vrnda_f16
vrndaq_f16
vrndaq_f16
vrnd_f16
vrnd_f16
vrndi_f16
vrndi_f16
vrndiq_f16
vrndiq_f16
vrndm_f16
vrndm_f16
vrndmq_f16
vrndmq_f16
vrndns_f16
vrndp_f16
vrndpq_f16
vrndq_f16
vrndx_f16
vrndxq_f16
vpmin_f16
vpmax_f16
vcaddq_rot270_f16
vcaddq_rot90_f16
vcadd_rot270_f16
vcadd_rot90_f16
vcvtm_s16_f16
vcvtmq_s16_f16
vcvtm_u16_f16
vcvtmq_u16_f16
vcvtaq_s16_f16
vcvtaq_u16_f16
vcvtnq_s16_f16
vcvtnq_u16_f16
vcvtn_s16_f16
vcvtn_u16_f16
vcvtaq_s16_f16
vcvtaq_u16_f16
vcvta_s16_f16
vcvta_u16_f16
vceqz_f16
vceqzq_f16
1 change: 1 addition & 0 deletions crates/intrinsic-test/src/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ impl Intrinsic {
format!(
"{promote}cast<{cast}>(__return_value)",
cast = match self.results.kind() {
TypeKind::Float if self.results.inner_size() == 16 => "float16_t".to_string(),
TypeKind::Float if self.results.inner_size() == 32 => "float".to_string(),
TypeKind::Float if self.results.inner_size() == 64 => "double".to_string(),
TypeKind::Int => format!("int{}_t", self.results.inner_size()),
Expand Down
28 changes: 19 additions & 9 deletions crates/intrinsic-test/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) {{
}}
#endif

std::ostream& operator<<(std::ostream& os, float16_t value) {{
uint16_t temp = 0;
memcpy(&temp, &value, sizeof(float16_t));
std::stringstream ss;
ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp;
os << ss.str();
return os;
}}

{arglists}

int main(int argc, char **argv) {{
Expand Down Expand Up @@ -185,6 +194,7 @@ fn generate_rust_program(notices: &str, intrinsic: &Intrinsic, target: &str) ->
format!(
r#"{notices}#![feature(simd_ffi)]
#![feature(link_llvm_intrinsics)]
#![feature(f16)]
#![cfg_attr(target_arch = "arm", feature(stdarch_arm_neon_intrinsics))]
#![cfg_attr(target_arch = "arm", feature(stdarch_aarch32_crc32))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_fcma))]
Expand All @@ -193,6 +203,7 @@ fn generate_rust_program(notices: &str, intrinsic: &Intrinsic, target: &str) ->
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sha3))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sm4))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_ftts))]
#![feature(stdarch_neon_f16)]
#![allow(non_upper_case_globals)]
use core_arch::arch::{target_arch}::*;

Expand Down Expand Up @@ -227,9 +238,9 @@ fn compile_c(
) -> bool {
let flags = std::env::var("CPPFLAGS").unwrap_or("".into());
let arch_flags = if target.contains("v7") {
"-march=armv8.6-a+crypto+crc+dotprod"
"-march=armv8.6-a+crypto+crc+dotprod+fp16"
} else {
"-march=armv8.6-a+crypto+sha3+crc+dotprod"
"-march=armv8.6-a+crypto+sha3+crc+dotprod+fp16"
};

let intrinsic_name = &intrinsic.name;
Expand Down Expand Up @@ -324,7 +335,12 @@ fn build_c(
let c_filename = format!(r#"c_programs/{}.cpp"#, i.name);
let mut file = File::create(&c_filename).unwrap();

let c_code = generate_c_program(notices, &["arm_neon.h", "arm_acle.h"], i, target);
let c_code = generate_c_program(
notices,
&["arm_neon.h", "arm_acle.h", "arm_fp16.h"],
i,
target,
);
file.write_all(c_code.into_bytes().as_slice()).unwrap();
match compiler {
None => true,
Expand Down Expand Up @@ -512,13 +528,7 @@ fn main() {
// Not sure how we would compare intrinsic that returns void.
.filter(|i| i.results.kind() != TypeKind::Void)
.filter(|i| i.results.kind() != TypeKind::BFloat)
.filter(|i| !(i.results.kind() == TypeKind::Float && i.results.inner_size() == 16))
.filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat))
.filter(|i| {
!i.arguments
.iter()
.any(|a| a.ty.kind() == TypeKind::Float && a.ty.inner_size() == 16)
})
// Skip pointers for now, we would probably need to look at the return
// type to work out how many elements we need to point to.
.filter(|i| !i.arguments.iter().any(|a| a.is_ptr()))
Expand Down
4 changes: 3 additions & 1 deletion crates/intrinsic-test/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,14 +348,16 @@ impl IntrinsicType {
}
IntrinsicType::Type {
kind: TypeKind::Float,
bit_len: Some(bit_len @ (32 | 64)),
bit_len: Some(bit_len @ (16 | 32 | 64)),
simd_len,
vec_len,
..
} => {
let (prefix, cast_prefix, cast_suffix, suffix) = match (language, bit_len) {
(&Language::Rust, 16) => ("[", "f16::from_bits(", ")", "]"),
(&Language::Rust, 32) => ("[", "f32::from_bits(", ")", "]"),
(&Language::Rust, 64) => ("[", "f64::from_bits(", ")", "]"),
(&Language::C, 16) => ("{", "cast<float16_t, uint16_t>(", ")", "}"),
(&Language::C, 32) => ("{", "cast<float, uint32_t>(", ")", "}"),
(&Language::C, 64) => ("{", "cast<double, uint64_t>(", ")", "}"),
_ => unreachable!(),
Expand Down
Loading