-
Notifications
You must be signed in to change notification settings - Fork 101
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add truncf16 and truncf128
#427
Conversation
e3c91f1
to
0466ff4
Compare
Testing locally on arm64, benchmarks are within noise tolerance for the |
9a54949
to
035bfbb
Compare
The algorithm is identical for both types, so this is a straightforward routine to port.
035bfbb
to
9d39102
Compare
Just for reference, asm on master:
.section .text.libm::math::trunc::trunc,"ax",@progbits
# f64 trunc as compiled on master (listing uses demangled symbol names).
# Argument and result are both in xmm0. The routine builds a mask of the bits
# that have to be cleared, returns the input untouched when none of them is
# set, and otherwise clears them with integer operations.
.globl libm::math::trunc::trunc
.p2align 4, 0x90
.type libm::math::trunc::trunc,@function
libm::math::trunc::trunc:
.cfi_startproc
# rax = raw bit pattern of the argument.
movq rax, xmm0
# rcx = bits 63..52 (sign + exponent field); edx = the 11-bit exponent field.
mov rcx, rax
shr rcx, 52
mov edx, ecx
and edx, 2047
# Exponent field above 1074 (= 1023 + 51): return xmm0 unchanged.
cmp rdx, 1074
ja .LBB109_3
# rsi = all-ones >> cl. A 64-bit shr uses only the low 6 bits of cl, and
# (field + 13) mod 64 equals (field - 1023) + 12 for fields 1023..1074, so
# rsi ends up covering the mantissa bits below the binary point.
add cl, 13
mov rsi, -1
shr rsi, cl
# rcx = clear-mask: rsi when field >= 1023, otherwise every bit except the
# sign bit (0x7fff_ffff_ffff_ffff).
cmp edx, 1023
movabs rcx, 9223372036854775807
cmovae rcx, rsi
# None of the bits to be cleared is set: return xmm0 unchanged.
test rcx, rax
je .LBB109_3
# Add a constant, spill the sum below rsp and reload it. xmm0 is overwritten
# by the movq further down, so the sum itself is discarded.
# NOTE(review): presumably the source-level forced evaluation used to raise
# a floating-point exception flag; the constant at .LCPI109_0 is not visible
# in this listing - confirm against the Rust source.
addsd xmm0, qword ptr [rip + .LCPI109_0]
movsd qword ptr [rsp - 8], xmm0
movsd xmm0, qword ptr [rsp - 8]
# Result = input bits with the masked bits cleared.
not rcx
and rcx, rax
movq xmm0, rcx
.LBB109_3:
ret
# f32 truncf as compiled on master (listing uses demangled symbol names).
# Same shape as the f64 routine: argument and result in xmm0, a clear-mask is
# built from the exponent field, and the input is returned untouched when no
# masked bit is set.
.section .text.libm::math::truncf::truncf,"ax",@progbits
.globl libm::math::truncf::truncf
.p2align 4, 0x90
.type libm::math::truncf::truncf,@function
libm::math::truncf::truncf:
.cfi_startproc
# eax = raw bit pattern of the argument.
movd eax, xmm0
# ecx = bits 31..23 (sign + exponent field); movzx keeps only the low 8 bits,
# i.e. the exponent field without the sign.
mov ecx, eax
shr ecx, 23
movzx edx, cl
# Exponent field above 149 (= 127 + 22): return xmm0 unchanged.
cmp edx, 149
ja .LBB110_3
# esi = all-ones >> cl. A 32-bit shr uses only the low 5 bits of cl, and
# (field + 10) mod 32 equals (field - 127) + 9 for fields 127..149, so esi
# ends up covering the mantissa bits below the binary point.
add cl, 10
mov esi, -1
shr esi, cl
# ecx = clear-mask: esi when field >= 127, otherwise every bit except the
# sign bit (0x7fff_ffff).
cmp edx, 127
mov ecx, 2147483647
cmovae ecx, esi
# None of the bits to be cleared is set: return xmm0 unchanged.
test ecx, eax
je .LBB110_3
# Add a constant, spill the sum below rsp and reload it. xmm0 is overwritten
# by the movd further down, so the sum itself is discarded.
# NOTE(review): presumably the source-level forced evaluation used to raise
# a floating-point exception flag; the constant at .LCPI110_0 is not visible
# in this listing - confirm against the Rust source.
addss xmm0, dword ptr [rip + .LCPI110_0]
movss dword ptr [rsp - 4], xmm0
movss xmm0, dword ptr [rsp - 4]
# Result = input bits with the masked bits cleared.
not ecx
and ecx, eax
movd xmm0, ecx
# Common exit; the ret instruction follows this label.
.LBB110_3:
ret
And with this PR:
.section .text.libm::math::trunc::trunc,"ax",@progbits
# f64 trunc as compiled with the PR applied (demangled symbol names).
# Argument and result are both in xmm0. This version builds a keep-mask
# (bits that survive) with an arithmetic shift, rather than a clear-mask.
.globl libm::math::trunc::trunc
.p2align 4, 0x90
.type libm::math::trunc::trunc,@function
libm::math::trunc::trunc:
.cfi_startproc
# rax = raw bit pattern of the argument; edx = the 11-bit exponent field.
movq rax, xmm0
mov rdx, rax
shr rdx, 52
and edx, 2047
# Exponent field above 1074 (= 1023 + 51): return xmm0 unchanged.
cmp edx, 1074
ja .LBB109_3
# ecx = exponent field minus the bias 1023.
lea ecx, [rdx - 1023]
# rsi starts as 0xfff0_0000_0000_0000 (the top 12 bits: sign + exponent
# field). The arithmetic shift right by cl extends that run of ones into the
# top cl mantissa bits, giving the keep-mask.
movabs rsi, -4503599627370496
sar rsi, cl
# rcx = keep-mask: rsi when field >= 1023, otherwise the sign bit only
# (0x8000_0000_0000_0000).
cmp edx, 1023
movabs rcx, -9223372036854775808
cmovae rcx, rsi
# rdx = complement of the keep-mask. If the input has none of those bits
# set, return xmm0 unchanged.
mov rdx, rcx
not rdx
test rdx, rax
je .LBB109_3
# Add a constant, spill the sum below rsp and reload it. xmm0 is overwritten
# by the movq further down, so the sum itself is discarded.
# NOTE(review): presumably the source-level forced evaluation used to raise
# a floating-point exception flag; the constant at .LCPI109_0 is not visible
# in this listing - confirm against the Rust source.
addsd xmm0, qword ptr [rip + .LCPI109_0]
movsd qword ptr [rsp - 8], xmm0
movsd xmm0, qword ptr [rsp - 8]
# Result = input bits restricted to the keep-mask.
and rcx, rax
movq xmm0, rcx
.LBB109_3:
ret
# f32 truncf as compiled with the PR applied (demangled symbol names).
# Argument and result are both in xmm0. A keep-mask (bits that survive) is
# built with an arithmetic shift and applied to the raw bits.
.section .text.libm::math::truncf::truncf,"ax",@progbits
.globl libm::math::truncf::truncf
.p2align 4, 0x90
.type libm::math::truncf::truncf,@function
libm::math::truncf::truncf:
.cfi_startproc
# eax = raw bit pattern of the argument; edx = the 8-bit exponent field
# (movzx drops the sign bit that sits in bit 8 of ecx after the shift).
movd eax, xmm0
mov ecx, eax
shr ecx, 23
movzx edx, cl
# Exponent field above 149 (= 127 + 22): return xmm0 unchanged.
cmp edx, 149
ja .LBB110_3
# ecx = exponent field minus the bias 127.
lea ecx, [rdx - 127]
# esi starts as 0xff80_0000 (the top 9 bits: sign + exponent field). The
# arithmetic shift right by cl extends that run of ones into the top cl
# mantissa bits, giving the keep-mask.
mov esi, -8388608
sar esi, cl
# ecx = keep-mask: esi when field >= 127, otherwise the sign bit only
# (0x8000_0000).
cmp edx, 127
mov ecx, -2147483648
cmovae ecx, esi
# edx = complement of the keep-mask. If the input has none of those bits
# set, return xmm0 unchanged.
mov edx, ecx
not edx
test edx, eax
je .LBB110_3
# Add a constant, spill the sum below rsp and reload it. xmm0 is overwritten
# by the movd further down, so the sum itself is discarded.
# NOTE(review): presumably the source-level forced evaluation used to raise
# a floating-point exception flag; the constant at .LCPI110_0 is not visible
# in this listing - confirm against the Rust source.
addss xmm0, dword ptr [rip + .LCPI110_0]
movss dword ptr [rsp - 4], xmm0
movss xmm0, dword ptr [rsp - 4]
# Result = input bits restricted to the keep-mask.
and ecx, eax
movd xmm0, ecx
.LBB110_3:
ret
# f16 truncf16 as compiled with the PR applied (demangled symbol names).
# The argument arrives in the low 16 bits of xmm0 and the result is returned
# there. Same keep-mask scheme as the wider routines, but the discarded
# floating-point add goes through two out-of-line helper calls, so this
# routine needs a real stack frame and callee-saved registers.
.section .text.libm::math::truncf16::truncf16,"ax",@progbits
.globl libm::math::truncf16::truncf16
.p2align 4, 0x90
.type libm::math::truncf16::truncf16,@function
libm::math::truncf16::truncf16:
.cfi_startproc
# Save rbp and rbx (both are used to hold values across the calls below).
# The push of rax only reserves 8 bytes of stack; it is released with
# "add rsp, 8" in the epilogue rather than popped.
push rbp
.cfi_def_cfa_offset 16
push rbx
.cfi_def_cfa_offset 24
push rax
.cfi_def_cfa_offset 32
.cfi_offset rbx, -24
.cfi_offset rbp, -16
# ebx = raw 16-bit pattern of the argument; eax = the 5-bit exponent field.
pextrw ebx, xmm0, 0
mov eax, ebx
shr eax, 10
and eax, 31
# Exponent field above 24 (= 15 + 9): return xmm0 unchanged.
cmp ax, 24
ja .LBB186_3
# cx = exponent field minus the bias 15.
mov ecx, eax
sub cx, 15
# The low 16 bits of edx start as 0xfc00 (sign + exponent field). The
# arithmetic shift right by cl extends that run of ones into the top cl
# mantissa bits, giving the keep-mask.
mov edx, -1024
sar edx, cl
# ebp = keep-mask: edx when field >= 15, otherwise the sign bit only (0x8000).
cmp ax, 15
mov ebp, 32768
cmovae ebp, edx
# ax = complement of the keep-mask (16-bit view). If the input has none of
# those bits set, return xmm0 unchanged.
mov eax, ebp
not eax
test ax, bx
je .LBB186_3
# xmm0 still holds the argument here. It is passed through __extendhfsf2,
# a constant is added with a single-precision add, and the sum is passed
# through __truncsfhf2.
# NOTE(review): the helper names suggest f16 -> f32 and f32 -> f16
# conversions from the compiler runtime - confirm. The constant at
# .LCPI186_0 is not visible in this listing.
call qword ptr [rip + __extendhfsf2@GOTPCREL]
addss xmm0, dword ptr [rip + .LCPI186_0]
call qword ptr [rip + __truncsfhf2@GOTPCREL]
# Spill the 16-bit sum into the reserved stack slot and reload it. The low
# word of xmm0 is overwritten by the final pinsrw, so the sum is discarded.
pextrw eax, xmm0, 0
mov word ptr [rsp + 6], ax
pinsrw xmm0, word ptr [rsp + 6], 0
# Result = input bits restricted to the keep-mask, placed in the low word
# of xmm0.
and ebp, ebx
pinsrw xmm0, ebp, 0
.LBB186_3:
# Epilogue: drop the reserved slot and restore rbx / rbp.
add rsp, 8
.cfi_def_cfa_offset 24
pop rbx
.cfi_def_cfa_offset 16
pop rbp
.cfi_def_cfa_offset 8
ret
# f128 truncf128 as compiled with the PR applied (demangled symbol names).
# The 128-bit argument arrives in xmm0 and the result is returned in xmm0.
# The keep-mask is 128 bits wide and is held as two 64-bit halves:
# r14 = high half, r15 = low half. The common exit label is followed by the
# epilogue; the final ret comes right after the last pop.
.section .text.libm::math::truncf128::truncf128,"ax",@progbits
.globl libm::math::truncf128::truncf128
.p2align 4, 0x90
.type libm::math::truncf128::truncf128,@function
libm::math::truncf128::truncf128:
.cfi_startproc
# Save the callee-saved registers used below and reserve 56 bytes of locals.
push r15
.cfi_def_cfa_offset 16
push r14
.cfi_def_cfa_offset 24
push r12
.cfi_def_cfa_offset 32
push rbx
.cfi_def_cfa_offset 40
sub rsp, 56
.cfi_def_cfa_offset 96
.cfi_offset rbx, -40
.cfi_offset r12, -32
.cfi_offset r14, -24
.cfi_offset r15, -16
# Spill the argument so its halves can be read as integers.
# rbx = high 64 bits; eax = the 15-bit exponent field (bits 62..48 of rbx).
movaps xmmword ptr [rsp + 16], xmm0
mov rbx, qword ptr [rsp + 24]
mov rax, rbx
shr rax, 48
and eax, 32767
# Exponent field above 16494 (= 16383 + 111): return xmm0 unchanged.
cmp eax, 16494
ja .LBB187_3
# ecx = exponent field minus the bias 16383; this is the shift amount.
lea ecx, [rax - 16383]
# rdx = 0xffff_0000_0000_0000: the high half of the starting mask (sign +
# exponent field); its low half is zero. The instructions below perform a
# 128-bit arithmetic shift right of that value by cl, in 64-bit pieces.
movabs rdx, -281474976710656
mov rsi, rdx
# rsi = high half shifted arithmetically by (cl mod 64).
sar rsi, cl
# dil = bit 6 of the shift amount: non-zero exactly when the shift is >= 64.
mov edi, ecx
and dil, 64
# r8 = 0, used later as the low half of the mask for fields below the bias.
xor r8d, r8d
# neg sets the carry flag iff dil was non-zero; the following mov leaves the
# flags alone, so "sbb r9, r9" yields all-ones for shifts >= 64, else zero.
mov r9d, edi
neg r9b
mov r9d, 0
sbb r9, r9
# r9 = high half of the shifted mask: all-ones for shifts >= 64, else rsi.
or r9, rsi
# r14 = high half of the keep-mask: r9 when field >= 16383, otherwise the
# sign bit only.
cmp eax, 16383
movabs r14, -9223372036854775808
cmovae r14, r9
# r15 = low half for shifts below 64: the bits of rdx that the double-width
# shift moves down into the (initially zero) low half.
xor r15d, r15d
shrd r15, rdx, cl
# For shifts >= 64 the low half is rsi instead.
test dil, dil
cmovne r15, rsi
# Field below the bias: the low half of the keep-mask is zero.
cmp eax, 16383
cmovb r15, r8
# r12 = low 64 bits of the argument.
mov r12, qword ptr [rsp + 16]
# Check whether the input has any bit set outside the keep-mask:
# (~r14 & high) | (~r15 & low). If not, return xmm0 unchanged.
mov rax, r14
not rax
mov rcx, r15
not rcx
and rcx, r12
and rax, rbx
or rax, rcx
je .LBB187_3
# xmm0 still holds the argument; call __addtf3 with a constant in xmm1, then
# spill and reload the returned value. xmm0 is overwritten by the final
# movaps, so the sum itself is discarded.
# NOTE(review): __addtf3 is presumably the runtime's 128-bit float add, and
# the whole sequence the source-level forced evaluation - confirm. The
# constant at .LCPI187_0 is not visible in this listing.
movaps xmm1, xmmword ptr [rip + .LCPI187_0]
call qword ptr [rip + __addtf3@GOTPCREL]
movaps xmmword ptr [rsp + 32], xmm0
movaps xmm0, xmmword ptr [rsp + 32]
# Result = input restricted to the keep-mask, assembled on the stack (low
# half at rsp, high half at rsp + 8) and loaded back into xmm0.
and r15, r12
mov qword ptr [rsp], r15
and r14, rbx
mov qword ptr [rsp + 8], r14
movaps xmm0, xmmword ptr [rsp]
.LBB187_3:
# Epilogue: release the locals and restore the callee-saved registers.
add rsp, 56
.cfi_def_cfa_offset 40
pop rbx
.cfi_def_cfa_offset 32
pop r12
.cfi_def_cfa_offset 24
pop r14
.cfi_def_cfa_offset 16
pop r15
.cfi_def_cfa_offset 8
ret
Very comparable. It is pretty unfortunate that the |
9d39102
to
9daf1a5
Compare
Note to self: rust-lang/rust#133050 means |
Use the generic algorithms to provide implementations for these routines.
9daf1a5
to
aa3f28b
Compare
Add a generic version of `trunc`, then use it to provide the algorithm for `f16` and `f128`.