Add `truncf16` and `truncf128` #427

tgross35 · 2025-01-12T04:14:35Z

Add a generic version of trunc, then use it to provide the algorithm for f16 and f128.

tgross35 · 2025-01-12T04:34:13Z

Testing locally on arm64, benchmarks are within noise tolerance for the f32 and f64 versions.

The algorithm is identical for both types, so this is a straightforward routine to port.

tgross35 · 2025-01-13T10:02:07Z

Just for reference, asm on master:

# trunc for f64, as generated on master (per the surrounding discussion).
# Reads the argument from xmm0 and leaves the result in xmm0.
# Shape of the code: build a mask of the bits to discard, return the input
# untouched if none of those bits are set, otherwise clear them.
.section .text.libm::math::trunc::trunc,"ax",@progbits
        .globl  libm::math::trunc::trunc
        .p2align        4, 0x90
.type   libm::math::trunc::trunc,@function
libm::math::trunc::trunc:
        .cfi_startproc
        # rax = raw bits of the input, rcx = bits 52..63, edx = 11-bit exponent field.
        movq rax, xmm0
        mov rcx, rax
        shr rcx, 52
        mov edx, ecx
        and edx, 2047
        # Exponent field above 1074 (= 1023 + 51): return with xmm0 still holding the input.
        cmp rdx, 1074
        ja .LBB109_3
        # rsi = all-ones >> cl. The 64-bit shift only uses the low 6 bits of cl,
        # and 1024 is a multiple of 64, so "+ 13" on the low byte gives the same
        # count as (exponent - 1023 + 12) mod 64.
        add cl, 13
        mov rsi, -1
        shr rsi, cl
        # rcx = discard mask: rsi when exponent >= 1023, otherwise
        # 0x7fffffffffffffff (every bit except bit 63).
        cmp edx, 1023
        movabs rcx, 9223372036854775807
        cmovae rcx, rsi
        # No discarded bit set in the input: return it unchanged.
        test rcx, rax
        je .LBB109_3
        # Add a constant (value not shown in this listing) and round-trip the sum
        # through a scratch slot below rsp. The sum is overwritten by the final movq,
        # so only the act of performing the add remains; presumably this is the
        # force_eval mentioned in the discussion.
        addsd xmm0, qword ptr [rip + .LCPI109_0]
        movsd qword ptr [rsp - 8], xmm0
        movsd xmm0, qword ptr [rsp - 8]
        # Result = input bits with the discard mask cleared.
        not rcx
        and rcx, rax
        movq xmm0, rcx
.LBB109_3:
        ret

# truncf for f32, as generated on master (per the surrounding discussion).
# Reads the argument from xmm0 and leaves the result in xmm0.
# Same shape as the f64 listing: build a mask of bits to discard, return the
# input untouched if none are set, otherwise clear them.
.section .text.libm::math::truncf::truncf,"ax",@progbits
        .globl  libm::math::truncf::truncf
        .p2align        4, 0x90
.type   libm::math::truncf::truncf,@function
libm::math::truncf::truncf:
        .cfi_startproc
        # eax = raw bits of the input, ecx = bits 23..31, edx = 8-bit exponent field.
        movd eax, xmm0
        mov ecx, eax
        shr ecx, 23
        movzx edx, cl
        # Exponent field above 149 (= 127 + 22): return with xmm0 still holding the input.
        cmp edx, 149
        ja .LBB110_3
        # esi = all-ones >> cl. The 32-bit shift only uses the low 5 bits of cl,
        # and 128 is a multiple of 32, so "+ 10" on the low byte gives the same
        # count as (exponent - 127 + 9) mod 32.
        add cl, 10
        mov esi, -1
        shr esi, cl
        # ecx = discard mask: esi when exponent >= 127, otherwise
        # 0x7fffffff (every bit except bit 31).
        cmp edx, 127
        mov ecx, 2147483647
        cmovae ecx, esi
        # No discarded bit set in the input: return it unchanged.
        test ecx, eax
        je .LBB110_3
        # Add a constant (value not shown in this listing) and round-trip the sum
        # through a scratch slot below rsp. The sum is overwritten by the final movd;
        # presumably this is the force_eval mentioned in the discussion.
        addss xmm0, dword ptr [rip + .LCPI110_0]
        movss dword ptr [rsp - 4], xmm0
        movss xmm0, dword ptr [rsp - 4]
        # Result = input bits with the discard mask cleared.
        not ecx
        and ecx, eax
        movd xmm0, ecx
.LBB110_3:
        ret

And with this PR:

# trunc for f64, as generated with the PR's generic implementation (per the
# surrounding discussion). Reads the argument from xmm0, result left in xmm0.
# Unlike the master listing, this version builds a mask of the bits to KEEP
# and inverts it only for the "anything to discard?" test.
.section .text.libm::math::trunc::trunc,"ax",@progbits
        .globl  libm::math::trunc::trunc
        .p2align        4, 0x90
.type   libm::math::trunc::trunc,@function
libm::math::trunc::trunc:
        .cfi_startproc
        # rax = raw bits of the input, edx = 11-bit exponent field (bits 52..62).
        movq rax, xmm0
        mov rdx, rax
        shr rdx, 52
        and edx, 2047
        # Exponent field above 1074 (= 1023 + 51): return with xmm0 still holding the input.
        cmp edx, 1074
        ja .LBB109_3
        # ecx = exponent - 1023. rsi = 0xfff0000000000000 (top 12 bits set)
        # arithmetically shifted right by that amount, which extends the run of
        # set bits downward.
        lea ecx, [rdx - 1023]
        movabs rsi, -4503599627370496
        sar rsi, cl
        # rcx = keep mask: rsi when exponent >= 1023, otherwise
        # 0x8000000000000000 (bit 63 only).
        cmp edx, 1023
        movabs rcx, -9223372036854775808
        cmovae rcx, rsi
        # rdx = ~keep mask; if the input has none of those bits set, return it unchanged.
        mov rdx, rcx
        not rdx
        test rdx, rax
        je .LBB109_3
        # Add a constant (value not shown in this listing) and round-trip the sum
        # through a scratch slot below rsp. The sum is overwritten by the final movq;
        # presumably this is the force_eval mentioned in the discussion.
        addsd xmm0, qword ptr [rip + .LCPI109_0]
        movsd qword ptr [rsp - 8], xmm0
        movsd xmm0, qword ptr [rsp - 8]
        # Result = input bits ANDed with the keep mask.
        and rcx, rax
        movq xmm0, rcx
.LBB109_3:
        ret

# truncf for f32, as generated with the PR's generic implementation (per the
# surrounding discussion). Reads the argument from xmm0, result left in xmm0.
# Builds a mask of the bits to KEEP and inverts it only for the
# "anything to discard?" test.
.section .text.libm::math::truncf::truncf,"ax",@progbits
        .globl  libm::math::truncf::truncf
        .p2align        4, 0x90
.type   libm::math::truncf::truncf,@function
libm::math::truncf::truncf:
        .cfi_startproc
        # eax = raw bits of the input, edx = 8-bit exponent field (bits 23..30).
        movd eax, xmm0
        mov ecx, eax
        shr ecx, 23
        movzx edx, cl
        # Exponent field above 149 (= 127 + 22): return with xmm0 still holding the input.
        cmp edx, 149
        ja .LBB110_3
        # ecx = exponent - 127. esi = 0xff800000 (top 9 bits set) arithmetically
        # shifted right by that amount, which extends the run of set bits downward.
        lea ecx, [rdx - 127]
        mov esi, -8388608
        sar esi, cl
        # ecx = keep mask: esi when exponent >= 127, otherwise 0x80000000 (bit 31 only).
        cmp edx, 127
        mov ecx, -2147483648
        cmovae ecx, esi
        # edx = ~keep mask; if the input has none of those bits set, return it unchanged.
        mov edx, ecx
        not edx
        test edx, eax
        je .LBB110_3
        # Add a constant (value not shown in this listing) and round-trip the sum
        # through a scratch slot below rsp. The sum is overwritten by the final movd;
        # presumably this is the force_eval mentioned in the discussion.
        addss xmm0, dword ptr [rip + .LCPI110_0]
        movss dword ptr [rsp - 4], xmm0
        movss xmm0, dword ptr [rsp - 4]
        # Result = input bits ANDed with the keep mask.
        and ecx, eax
        movd xmm0, ecx
.LBB110_3:
        ret

# truncf16 for f16, added by the PR (per the surrounding discussion).
# Reads the 16-bit argument from the low word of xmm0 and leaves the result in
# the low word of xmm0. Same keep-mask shape as the f32/f64 PR listings, but
# the add goes through two out-of-line calls, so this is not a leaf function.
.section .text.libm::math::truncf16::truncf16,"ax",@progbits
        .globl  libm::math::truncf16::truncf16
        .p2align        4, 0x90
.type   libm::math::truncf16::truncf16,@function
libm::math::truncf16::truncf16:
        .cfi_startproc
        # rbp and rbx are saved because they carry the keep mask and the input bits
        # across the calls below. rax is pushed but never popped (the epilogue uses
        # "add rsp, 8"), so that push only reserves 8 bytes of stack; [rsp + 6]
        # inside it is used later as a 2-byte scratch slot.
        push rbp
        .cfi_def_cfa_offset 16
        push rbx
        .cfi_def_cfa_offset 24
        push rax
        .cfi_def_cfa_offset 32
        .cfi_offset rbx, -24
        .cfi_offset rbp, -16
        # ebx = raw 16 bits of the input (zero-extended), eax = 5-bit exponent field (bits 10..14).
        pextrw ebx, xmm0, 0
        mov eax, ebx
        shr eax, 10
        and eax, 31
        # Exponent field above 24 (= 15 + 9): return with xmm0 still holding the input.
        cmp ax, 24
        ja .LBB186_3
        # cx = exponent - 15. edx = 0xfffffc00 (low 16 bits: 0xfc00, top 6 bits set)
        # arithmetically shifted right by that amount.
        mov ecx, eax
        sub cx, 15
        mov edx, -1024
        sar edx, cl
        # ebp = keep mask: edx when exponent >= 15, otherwise 0x8000 (bit 15 only).
        cmp ax, 15
        mov ebp, 32768
        cmovae ebp, edx
        # ax = ~keep mask (low 16 bits); if the input has none of those bits set,
        # return it unchanged.
        mov eax, ebp
        not eax
        test ax, bx
        je .LBB186_3
        # The add is done at f32 width between two libcalls reached through the GOT;
        # judging by their names these widen f16 -> f32 and narrow f32 -> f16
        # (their definitions are not part of this listing). These are the
        # "two libcalls" the discussion attributes to force_eval for f16.
        call qword ptr [rip + __extendhfsf2@GOTPCREL]
        addss xmm0, dword ptr [rip + .LCPI186_0]
        call qword ptr [rip + __truncsfhf2@GOTPCREL]
        # Round-trip the 16-bit sum through the scratch slot; it is overwritten by
        # the final pinsrw, so its value does not reach the caller.
        pextrw eax, xmm0, 0
        mov word ptr [rsp + 6], ax
        pinsrw xmm0, word ptr [rsp + 6], 0
        # Result = input bits ANDed with the keep mask, placed in word 0 of xmm0.
        and ebp, ebx
        pinsrw xmm0, ebp, 0
.LBB186_3:
        # Release the 8 reserved bytes and restore the saved registers.
        add rsp, 8
        .cfi_def_cfa_offset 24
        pop rbx
        .cfi_def_cfa_offset 16
        pop rbp
        .cfi_def_cfa_offset 8
        ret

# truncf128 for f128, added by the PR (per the surrounding discussion).
# Reads the 128-bit argument from xmm0 and leaves the result in xmm0.
# Same keep-mask shape as the narrower PR listings, but the mask is 128 bits
# wide and is assembled from two 64-bit halves: r14 (high) and r15 (low).
.section .text.libm::math::truncf128::truncf128,"ax",@progbits
        .globl  libm::math::truncf128::truncf128
        .p2align        4, 0x90
.type   libm::math::truncf128::truncf128,@function
libm::math::truncf128::truncf128:
        .cfi_startproc
        # r15, r14, r12 and rbx are saved because they hold the mask halves and the
        # input halves across the call below; 56 more bytes are reserved for the
        # 16-byte slots at [rsp], [rsp + 16] and [rsp + 32].
        push r15
        .cfi_def_cfa_offset 16
        push r14
        .cfi_def_cfa_offset 24
        push r12
        .cfi_def_cfa_offset 32
        push rbx
        .cfi_def_cfa_offset 40
        sub rsp, 56
        .cfi_def_cfa_offset 96
        .cfi_offset rbx, -40
        .cfi_offset r12, -32
        .cfi_offset r14, -24
        .cfi_offset r15, -16
        # Spill the input so its halves can be read as integers.
        # rbx = high 64 bits, eax = 15-bit exponent field (bits 48..62 of the high half).
        movaps xmmword ptr [rsp + 16], xmm0
        mov rbx, qword ptr [rsp + 24]
        mov rax, rbx
        shr rax, 48
        and eax, 32767
        # Exponent field above 16494 (= 16383 + 111): return with xmm0 still holding the input.
        cmp eax, 16494
        ja .LBB187_3
        # ecx = exponent - 16383 (the shift amount). rdx = 0xffff000000000000
        # (top 16 bits set). rsi = rdx arithmetically shifted right by (cl mod 64).
        lea ecx, [rax - 16383]
        movabs rdx, -281474976710656
        mov rsi, rdx
        sar rsi, cl
        # dil = bit 6 of the shift amount, i.e. non-zero when the shift is 64 or more.
        mov edi, ecx
        and dil, 64
        xor r8d, r8d
        # neg sets CF when its operand is non-zero; "mov r9d, 0" is used instead of
        # xor because mov leaves CF intact; sbb then turns CF into 0 or all-ones.
        # Net effect: r9 = all-ones when the shift is >= 64, else 0.
        mov r9d, edi
        neg r9b
        mov r9d, 0
        sbb r9, r9
        # High half of the shifted mask: all-ones for shifts >= 64, else rsi.
        or r9, rsi
        # r14 = high keep mask: r9 when exponent >= 16383, otherwise
        # 0x8000000000000000 (bit 63 only).
        cmp eax, 16383
        movabs r14, -9223372036854775808
        cmovae r14, r9
        # Low half of the shifted mask: for shifts below 64, the bits of rdx that
        # fall off its bottom (shrd into a zeroed r15); for shifts >= 64, rsi.
        xor r15d, r15d
        shrd r15, rdx, cl
        test dil, dil
        cmovne r15, rsi
        # r15 = low keep mask: forced to 0 (r8) when exponent < 16383.
        cmp eax, 16383
        cmovb r15, r8
        # r12 = low 64 bits of the input. Test input & ~mask across both halves;
        # if nothing would be discarded, return the input unchanged.
        mov r12, qword ptr [rsp + 16]
        mov rax, r14
        not rax
        mov rcx, r15
        not rcx
        and rcx, r12
        and rax, rbx
        or rax, rcx
        je .LBB187_3
        # xmm0 still holds the input here; xmm1 is loaded with a constant (value not
        # shown in this listing) and __addtf3 is called through the GOT, presumably
        # a soft-float f128 addition (its definition is not part of this listing).
        # The returned value is stored and reloaded, then overwritten by the final movaps.
        movaps xmm1, xmmword ptr [rip + .LCPI187_0]
        call qword ptr [rip + __addtf3@GOTPCREL]
        movaps xmmword ptr [rsp + 32], xmm0
        movaps xmm0, xmmword ptr [rsp + 32]
        # Result = input ANDed with the keep mask, assembled in memory
        # (low half at [rsp], high half at [rsp + 8]) and loaded into xmm0.
        and r15, r12
        mov qword ptr [rsp], r15
        and r14, rbx
        mov qword ptr [rsp + 8], r14
        movaps xmm0, xmmword ptr [rsp]
.LBB187_3:
        # Release the locals and restore the saved registers in reverse push order.
        add rsp, 56
        .cfi_def_cfa_offset 40
        pop rbx
        .cfi_def_cfa_offset 32
        pop r12
        .cfi_def_cfa_offset 24
        pop r14
        .cfi_def_cfa_offset 16
        pop r15
        .cfi_def_cfa_offset 8
        ret

The f32 and f64 output is very comparable. It is pretty unfortunate that the force_eval call for f16 adds two libcalls (`__extendhfsf2` and `__truncsfhf2`). We probably don't really need the forced evaluation, but I'll look into fpenv at a later time.

tgross35 · 2025-01-13T10:09:43Z

Note to self: rust-lang/rust#133050 means f16 and f128 symbols never show up in rlibs, so they need to be temporarily marked `#[inline(never)]` before running `cargo asm`.

Use the generic algorithms to provide implementations for these routines.

tgross35 force-pushed the generic-trunc branch from e3c91f1 to 0466ff4 Compare January 12, 2025 04:16

tgross35 force-pushed the generic-trunc branch 3 times, most recently from 9a54949 to 035bfbb Compare January 13, 2025 09:47

Add a generic version of trunc

ef493a8

The algorithm is identical for both types, so this is a straightforward routine to port.

tgross35 force-pushed the generic-trunc branch from 035bfbb to 9d39102 Compare January 13, 2025 10:01

tgross35 force-pushed the generic-trunc branch from 9d39102 to 9daf1a5 Compare January 13, 2025 10:02

Add truncf16 and truncf128

aa3f28b

Use the generic algorithms to provide implementations for these routines.

tgross35 force-pushed the generic-trunc branch from 9daf1a5 to aa3f28b Compare January 13, 2025 10:12

tgross35 enabled auto-merge January 13, 2025 10:26

tgross35 merged commit 01384e8 into rust-lang:master Jan 13, 2025
35 checks passed

tgross35 deleted the generic-trunc branch January 13, 2025 10:33

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add `truncf16` and `truncf128` #427

Add `truncf16` and `truncf128` #427

tgross35 commented Jan 12, 2025

tgross35 commented Jan 12, 2025

tgross35 commented Jan 13, 2025

tgross35 commented Jan 13, 2025

Add truncf16 and truncf128 #427

Add truncf16 and truncf128 #427

Conversation

tgross35 commented Jan 12, 2025

tgross35 commented Jan 12, 2025

tgross35 commented Jan 13, 2025

tgross35 commented Jan 13, 2025

Add `truncf16` and `truncf128` #427

Add `truncf16` and `truncf128` #427