Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add truncf16 and truncf128 #427

Merged
merged 2 commits into from
Jan 13, 2025
Merged

Conversation

tgross35
Copy link
Contributor

Add a generic version of trunc, then use it to provide the algorithm for f16 and f128.

@tgross35
Copy link
Contributor Author

Tested locally on arm64: benchmarks for the f32 and f64 versions are within noise tolerance.

@tgross35 tgross35 force-pushed the generic-trunc branch 3 times, most recently from 9a54949 to 035bfbb Compare January 13, 2025 09:47
The algorithm is identical for both types, so this is a straightforward
routine to port.
@tgross35
Copy link
Contributor Author

Just for reference, asm on master:

# trunc for f64 as built from master (x86-64, Intel syntax, compiler output).
# In:  xmm0 = x.  Out: xmm0 = x with its fractional bits cleared.
# Scratch: rax, rcx, rdx, rsi, flags, and 8 bytes at [rsp - 8].
# Approach: build a mask of the bits to DISCARD, then clear them.
.section .text.libm::math::trunc::trunc,"ax",@progbits
        .globl  libm::math::trunc::trunc
        .p2align        4, 0x90
.type   libm::math::trunc::trunc,@function
libm::math::trunc::trunc:
        .cfi_startproc
        # rax = raw bits of x; rcx = bits >> 52 (sign + exponent);
        # rdx = 11-bit biased exponent.
        movq rax, xmm0
        mov rcx, rax
        shr rcx, 52
        mov edx, ecx
        and edx, 2047
        # Biased exponent > 1074 (= 1023 + 51): no fractional bits can exist
        # (this range also includes exponent 2047), so x is returned as is.
        cmp rdx, 1074
        ja .LBB109_3
        # A 64-bit shift uses only the low 6 bits of cl, so cl + 13 acts as
        # (exponent - 1023 + 12) mod 64. rsi = all-ones shifted right by
        # that count = the mantissa bits below the binary point.
        add cl, 13
        mov rsi, -1
        shr rsi, cl
        # exponent >= 1023 (|x| >= 1): discard mask = rsi.
        # Otherwise discard everything except the sign bit (result is +/-0).
        cmp edx, 1023
        movabs rcx, 9223372036854775807
        cmovae rcx, rsi
        # None of the bits to discard are set: x is already integral.
        test rcx, rax
        je .LBB109_3
        # Add a constant (value not shown in this listing) and round-trip the
        # sum through memory. The sum is overwritten by the movq below, so it
        # never reaches the return value; presumably this is the force_eval
        # mentioned in the PR discussion. [rsp - 8] lies below an unadjusted
        # rsp, presumably relying on the SysV AMD64 red zone.
        addsd xmm0, qword ptr [rip + .LCPI109_0]
        movsd qword ptr [rsp - 8], xmm0
        movsd xmm0, qword ptr [rsp - 8]
        # Clear the discarded bits of the original value and return it.
        not rcx
        and rcx, rax
        movq xmm0, rcx
.LBB109_3:
        ret

# trunc for f32 as built from master (x86-64, Intel syntax, compiler output).
# In:  xmm0 = x.  Out: xmm0 = x with its fractional bits cleared.
# Scratch: eax, ecx, edx, esi, flags, and 4 bytes at [rsp - 4].
# Approach: build a mask of the bits to DISCARD, then clear them.
.section .text.libm::math::truncf::truncf,"ax",@progbits
        .globl  libm::math::truncf::truncf
        .p2align        4, 0x90
.type   libm::math::truncf::truncf,@function
libm::math::truncf::truncf:
        .cfi_startproc
        # eax = raw bits of x; ecx = bits >> 23 (sign + exponent);
        # edx = low byte of that = 8-bit biased exponent (sign bit dropped).
        movd eax, xmm0
        mov ecx, eax
        shr ecx, 23
        movzx edx, cl
        # Biased exponent > 149 (= 127 + 22): no fractional bits can exist
        # (this range also includes exponent 255), so x is returned as is.
        cmp edx, 149
        ja .LBB110_3
        # A 32-bit shift uses only the low 5 bits of cl, so cl + 10 acts as
        # (exponent - 127 + 9) mod 32. esi = all-ones shifted right by that
        # count = the mantissa bits below the binary point.
        add cl, 10
        mov esi, -1
        shr esi, cl
        # exponent >= 127 (|x| >= 1): discard mask = esi.
        # Otherwise discard everything except the sign bit (result is +/-0).
        cmp edx, 127
        mov ecx, 2147483647
        cmovae ecx, esi
        # None of the bits to discard are set: x is already integral.
        test ecx, eax
        je .LBB110_3
        # Add a constant (value not shown in this listing) and round-trip the
        # sum through memory. The sum is overwritten by the movd below, so it
        # never reaches the return value; presumably this is the force_eval
        # mentioned in the PR discussion. [rsp - 4] lies below an unadjusted
        # rsp, presumably relying on the SysV AMD64 red zone.
        addss xmm0, dword ptr [rip + .LCPI110_0]
        movss dword ptr [rsp - 4], xmm0
        movss xmm0, dword ptr [rsp - 4]
        # Clear the discarded bits of the original value and return it.
        not ecx
        and ecx, eax
        movd xmm0, ecx
.LBB110_3:
        ret

And with this PR:

# trunc for f64 as built with this PR (x86-64, Intel syntax, compiler output).
# In:  xmm0 = x.  Out: xmm0 = x with its fractional bits cleared.
# Scratch: rax, rcx, rdx, rsi, flags, and 8 bytes at [rsp - 8].
# Approach: build a mask of the bits to KEEP, then AND it with the input.
.section .text.libm::math::trunc::trunc,"ax",@progbits
        .globl  libm::math::trunc::trunc
        .p2align        4, 0x90
.type   libm::math::trunc::trunc,@function
libm::math::trunc::trunc:
        .cfi_startproc
        # rax = raw bits of x; edx = 11-bit biased exponent.
        movq rax, xmm0
        mov rdx, rax
        shr rdx, 52
        and edx, 2047
        # Biased exponent > 1074 (= 1023 + 51): no fractional bits can exist
        # (this range also includes exponent 2047), so x is returned as is.
        cmp edx, 1074
        ja .LBB109_3
        # ecx = unbiased exponent. rsi starts as -2^52 (sign and exponent
        # bits set, mantissa clear); the arithmetic shift drags ones down
        # into the top `exponent` mantissa bits, i.e. the integer part.
        lea ecx, [rdx - 1023]
        movabs rsi, -4503599627370496
        sar rsi, cl
        # exponent >= 1023 (|x| >= 1): keep mask = rsi.
        # Otherwise keep only the sign bit (result is +/-0).
        cmp edx, 1023
        movabs rcx, -9223372036854775808
        cmovae rcx, rsi
        # rdx = bits that would be dropped; if none are set in x, it is
        # already integral and is returned unchanged.
        mov rdx, rcx
        not rdx
        test rdx, rax
        je .LBB109_3
        # Add a constant (value not shown in this listing) and round-trip the
        # sum through memory. The sum is overwritten by the movq below, so it
        # never reaches the return value; presumably this is the force_eval
        # mentioned in the PR discussion. [rsp - 8] lies below an unadjusted
        # rsp, presumably relying on the SysV AMD64 red zone.
        addsd xmm0, qword ptr [rip + .LCPI109_0]
        movsd qword ptr [rsp - 8], xmm0
        movsd xmm0, qword ptr [rsp - 8]
        # Apply the keep mask to the original bits and return the result.
        and rcx, rax
        movq xmm0, rcx
.LBB109_3:
        ret

# trunc for f32 as built with this PR (x86-64, Intel syntax, compiler output).
# In:  xmm0 = x.  Out: xmm0 = x with its fractional bits cleared.
# Scratch: eax, ecx, edx, esi, flags, and 4 bytes at [rsp - 4].
# Approach: build a mask of the bits to KEEP, then AND it with the input.
.section .text.libm::math::truncf::truncf,"ax",@progbits
        .globl  libm::math::truncf::truncf
        .p2align        4, 0x90
.type   libm::math::truncf::truncf,@function
libm::math::truncf::truncf:
        .cfi_startproc
        # eax = raw bits of x; edx = 8-bit biased exponent (the low byte of
        # bits >> 23, which drops the sign bit).
        movd eax, xmm0
        mov ecx, eax
        shr ecx, 23
        movzx edx, cl
        # Biased exponent > 149 (= 127 + 22): no fractional bits can exist
        # (this range also includes exponent 255), so x is returned as is.
        cmp edx, 149
        ja .LBB110_3
        # ecx = unbiased exponent. esi starts as -2^23 (sign and exponent
        # bits set, mantissa clear); the arithmetic shift drags ones down
        # into the top `exponent` mantissa bits, i.e. the integer part.
        lea ecx, [rdx - 127]
        mov esi, -8388608
        sar esi, cl
        # exponent >= 127 (|x| >= 1): keep mask = esi.
        # Otherwise keep only the sign bit (result is +/-0).
        cmp edx, 127
        mov ecx, -2147483648
        cmovae ecx, esi
        # edx = bits that would be dropped; if none are set in x, it is
        # already integral and is returned unchanged.
        mov edx, ecx
        not edx
        test edx, eax
        je .LBB110_3
        # Add a constant (value not shown in this listing) and round-trip the
        # sum through memory. The sum is overwritten by the movd below, so it
        # never reaches the return value; presumably this is the force_eval
        # mentioned in the PR discussion. [rsp - 4] lies below an unadjusted
        # rsp, presumably relying on the SysV AMD64 red zone.
        addss xmm0, dword ptr [rip + .LCPI110_0]
        movss dword ptr [rsp - 4], xmm0
        movss xmm0, dword ptr [rsp - 4]
        # Apply the keep mask to the original bits and return the result.
        and ecx, eax
        movd xmm0, ecx
.LBB110_3:
        ret

# trunc for f16 as built with this PR (x86-64, Intel syntax, compiler output).
# In:  low 16 bits of xmm0 = x.  Out: low 16 bits of xmm0 = truncated x.
# Register roles: ebx = raw bits of x, ebp = keep mask; both are pushed on
# entry and restored on exit, and both stay live across the two calls.
# Not a leaf: calls __extendhfsf2 and __truncsfhf2 through the GOT.
.section .text.libm::math::truncf16::truncf16,"ax",@progbits
        .globl  libm::math::truncf16::truncf16
        .p2align        4, 0x90
.type   libm::math::truncf16::truncf16,@function
libm::math::truncf16::truncf16:
        .cfi_startproc
        # Save rbp and rbx. The push of rax only reserves 8 bytes: the slot is
        # used as scratch at [rsp + 6] and released with `add rsp, 8`, never
        # popped back into rax.
        push rbp
        .cfi_def_cfa_offset 16
        push rbx
        .cfi_def_cfa_offset 24
        push rax
        .cfi_def_cfa_offset 32
        .cfi_offset rbx, -24
        .cfi_offset rbp, -16
        # ebx = raw 16-bit pattern of x (zero-extended);
        # eax = 5-bit biased exponent.
        pextrw ebx, xmm0, 0
        mov eax, ebx
        shr eax, 10
        and eax, 31
        # Biased exponent > 24 (= 15 + 9): no fractional bits can exist
        # (this range also includes exponent 31), so x is returned as is.
        cmp ax, 24
        ja .LBB186_3
        # cx = unbiased exponent. edx starts as -1024 (low 16 bits 0xFC00:
        # sign and exponent set, mantissa clear); the arithmetic shift drags
        # ones down into the top `exponent` mantissa bits.
        mov ecx, eax
        sub cx, 15
        mov edx, -1024
        sar edx, cl
        # exponent >= 15 (|x| >= 1): keep mask = edx.
        # Otherwise keep only the sign bit, 0x8000 (result is +/-0).
        cmp ax, 15
        mov ebp, 32768
        cmovae ebp, edx
        # ax = bits that would be dropped; if none are set in x, it is
        # already integral and is returned unchanged.
        mov eax, ebp
        not eax
        test ax, bx
        je .LBB186_3
        # xmm0 still holds x here. The call / addss / call sequence, judging
        # by the routine names, widens x to f32, adds a constant (value not
        # shown in this listing) and narrows back to f16. The result is
        # round-tripped through [rsp + 6] and then overwritten by the final
        # pinsrw, so it never reaches the return value; presumably this is
        # the force_eval whose two libcalls the PR discussion mentions.
        call qword ptr [rip + __extendhfsf2@GOTPCREL]
        addss xmm0, dword ptr [rip + .LCPI186_0]
        call qword ptr [rip + __truncsfhf2@GOTPCREL]
        pextrw eax, xmm0, 0
        mov word ptr [rsp + 6], ax
        pinsrw xmm0, word ptr [rsp + 6], 0
        # Apply the keep mask to the original bits and place the 16-bit
        # result in the low word of xmm0.
        and ebp, ebx
        pinsrw xmm0, ebp, 0
.LBB186_3:
        # Common exit: release the scratch slot and restore rbx and rbp.
        add rsp, 8
        .cfi_def_cfa_offset 24
        pop rbx
        .cfi_def_cfa_offset 16
        pop rbp
        .cfi_def_cfa_offset 8
        ret

# trunc for f128 as built with this PR (x86-64, Intel syntax, compiler output).
# In:  xmm0 = x (128 bits).  Out: xmm0 = x with its fractional bits cleared.
# Register roles (all four are pushed on entry and restored on exit, and stay
# live across the call):
#   rbx = high 64 bits of x      r12 = low 64 bits of x
#   r14 = keep mask, high half   r15 = keep mask, low half
# Stack frame (56 bytes): [rsp + 16] = spilled x, [rsp + 32] = scratch for
# the forced add, [rsp] = assembled result.
# Not a leaf: calls __addtf3 through the GOT.
.section .text.libm::math::truncf128::truncf128,"ax",@progbits
        .globl  libm::math::truncf128::truncf128
        .p2align        4, 0x90
.type   libm::math::truncf128::truncf128,@function
libm::math::truncf128::truncf128:
        .cfi_startproc
        push r15
        .cfi_def_cfa_offset 16
        push r14
        .cfi_def_cfa_offset 24
        push r12
        .cfi_def_cfa_offset 32
        push rbx
        .cfi_def_cfa_offset 40
        sub rsp, 56
        .cfi_def_cfa_offset 96
        .cfi_offset rbx, -40
        .cfi_offset r12, -32
        .cfi_offset r14, -24
        .cfi_offset r15, -16
        # Spill x so its halves can be read as integers.
        # rbx = high half; eax = 15-bit biased exponent.
        movaps xmmword ptr [rsp + 16], xmm0
        mov rbx, qword ptr [rsp + 24]
        mov rax, rbx
        shr rax, 48
        and eax, 32767
        # Biased exponent > 16494 (= 16383 + 111): no fractional bits can
        # exist (this range also includes exponent 32767), so x is returned
        # as is.
        cmp eax, 16494
        ja .LBB187_3
        # ecx = unbiased exponent = shift count. The code below performs a
        # 128-bit arithmetic right shift of (rdx:0), where rdx = -2^48 (sign
        # and exponent bits set), using 64-bit pieces.
        lea ecx, [rax - 16383]
        movabs rdx, -281474976710656
        # rsi = rdx >> (count mod 64), arithmetic.
        mov rsi, rdx
        sar rsi, cl
        # dil = bit 6 of the count, nonzero when the count is 64 or more.
        mov edi, ecx
        and dil, 64
        # r8 = 0, used further down as the low-half mask for |x| < 1.
        xor r8d, r8d
        # neg sets CF when r9b was nonzero; `mov r9d, 0` leaves flags alone;
        # sbb then turns CF into all-ones (count >= 64) or zero.
        mov r9d, edi
        neg r9b
        mov r9d, 0
        sbb r9, r9
        # High half of the shifted value: all ones when count >= 64, else rsi.
        or r9, rsi
        # exponent >= 16383 (|x| >= 1): high keep mask = r9.
        # Otherwise keep only the sign bit.
        cmp eax, 16383
        movabs r14, -9223372036854775808
        cmovae r14, r9
        # Low half for count < 64: shift a zero right, filling from the low
        # bits of rdx.
        xor r15d, r15d
        shrd r15, rdx, cl
        # Low half for count >= 64: the shifted high word, rsi.
        test dil, dil
        cmovne r15, rsi
        # |x| < 1: low keep mask = 0.
        cmp eax, 16383
        cmovb r15, r8
        # r12 = low half of x. If (x & ~keep_mask) is zero in both halves,
        # x is already integral and is returned unchanged.
        mov r12, qword ptr [rsp + 16]
        mov rax, r14
        not rax
        mov rcx, r15
        not rcx
        and rcx, r12
        and rax, rbx
        or rax, rcx
        je .LBB187_3
        # xmm0 still holds x here; xmm1 = a constant (value not shown in this
        # listing). The __addtf3 result is round-tripped through [rsp + 32]
        # and then overwritten by the final movaps, so it never reaches the
        # return value; presumably this is the force_eval mentioned in the
        # PR discussion.
        movaps xmm1, xmmword ptr [rip + .LCPI187_0]
        call qword ptr [rip + __addtf3@GOTPCREL]
        movaps xmmword ptr [rsp + 32], xmm0
        movaps xmm0, xmmword ptr [rsp + 32]
        # Apply the keep mask to both halves, assemble the 128-bit result at
        # [rsp] and load it into xmm0.
        and r15, r12
        mov qword ptr [rsp], r15
        and r14, rbx
        mov qword ptr [rsp + 8], r14
        movaps xmm0, xmmword ptr [rsp]
.LBB187_3:
        # Common exit: release the frame and restore the saved registers.
        add rsp, 56
        .cfi_def_cfa_offset 40
        pop rbx
        .cfi_def_cfa_offset 32
        pop r12
        .cfi_def_cfa_offset 24
        pop r14
        .cfi_def_cfa_offset 16
        pop r15
        .cfi_def_cfa_offset 8
        ret

The generated code is very comparable. It is pretty unfortunate that the force_eval call for f16 adds two libcalls (__extendhfsf2 and __truncsfhf2). We probably don't really need the forced evaluation, but I'll look into fpenv at a later time.

@tgross35
Copy link
Contributor Author

Note to self: rust-lang/rust#133050 means f16 and f128 symbols never show up in rlibs, so they need to be temporarily marked #[inline(never)] before running cargo asm.

Use the generic algorithms to provide implementations for these
routines.
@tgross35 tgross35 enabled auto-merge January 13, 2025 10:26
@tgross35 tgross35 merged commit 01384e8 into rust-lang:master Jan 13, 2025
35 checks passed
@tgross35 tgross35 deleted the generic-trunc branch January 13, 2025 10:33
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant