/*
 * Copyright 2018-2020 Yury Gribov
 *
 * The MIT License (MIT)
 *
 * Use of this source code is governed by MIT license that can be
 * found in the LICENSE.txt file.
 */

  .data

  // Lazily-initialized table of resolved function addresses.
  .globl _libnvidia_encode_so_tramp_table
  .hidden _libnvidia_encode_so_tramp_table
  .align 8
_libnvidia_encode_so_tramp_table:
  .zero 24

  .text

  .globl _libnvidia_encode_so_tramp_resolve
  .hidden _libnvidia_encode_so_tramp_resolve

  .globl _libnvidia_encode_so_save_regs_and_resolve
  .hidden _libnvidia_encode_so_save_regs_and_resolve
  .type _libnvidia_encode_so_save_regs_and_resolve, %function
_libnvidia_encode_so_save_regs_and_resolve:
  .cfi_startproc

#define PUSH_REG(reg) pushq %reg ; .cfi_adjust_cfa_offset 8; .cfi_rel_offset reg, 0
#define POP_REG(reg) popq %reg ; .cfi_adjust_cfa_offset -8; .cfi_restore reg

#define DEC_STACK(d) subq $d, %rsp; .cfi_adjust_cfa_offset d
#define INC_STACK(d) addq $d, %rsp; .cfi_adjust_cfa_offset -d

#define PUSH_XMM_REG(reg) DEC_STACK(16); movdqa %reg, (%rsp); .cfi_rel_offset reg, 0
#define POP_XMM_REG(reg) movdqa (%rsp), %reg; .cfi_restore reg; INC_STACK(16)

  // Slow path which calls dlsym, taken only on first call.
  // All registers are saved to handle arbitrary calling conventions
  // (except x87 FPU registers which do not have to be preserved).
  // For DWARF directives, read https://www.imperialviolet.org/2017/01/18/cfi.html.
  // FIXME: AVX (YMM, ZMM) registers are NOT saved to simplify code.

  PUSH_REG(rdi)  // 16
  // Load the symbol index pushed by the trampoline (at 0x10(%rsp), past
  // saved %rdi and the return address) as the resolver's first argument.
  mov 0x10(%rsp), %rdi
  PUSH_REG(rax)
  PUSH_REG(rbx)  // 16
  PUSH_REG(rcx)
  PUSH_REG(rdx)  // 16
  PUSH_REG(rbp)
  PUSH_REG(rsi)  // 16
  PUSH_REG(r8)
  PUSH_REG(r9)  // 16
  PUSH_REG(r10)
  PUSH_REG(r11)  // 16
  PUSH_REG(r12)
  PUSH_REG(r13)  // 16
  PUSH_REG(r14)
  PUSH_REG(r15)  // 16
  PUSH_XMM_REG(xmm0)
  PUSH_XMM_REG(xmm1)
  PUSH_XMM_REG(xmm2)
  PUSH_XMM_REG(xmm3)
  PUSH_XMM_REG(xmm4)
  PUSH_XMM_REG(xmm5)
  PUSH_XMM_REG(xmm6)
  PUSH_XMM_REG(xmm7)

  // Stack is just 8-byte aligned but callee will re-align to 16
  call _libnvidia_encode_so_tramp_resolve

  POP_XMM_REG(xmm7)
  POP_XMM_REG(xmm6)
  POP_XMM_REG(xmm5)
  POP_XMM_REG(xmm4)
  POP_XMM_REG(xmm3)
  POP_XMM_REG(xmm2)
  POP_XMM_REG(xmm1)
  POP_XMM_REG(xmm0)  // 16
  POP_REG(r15)
  POP_REG(r14)  // 16
  POP_REG(r13)
  POP_REG(r12)  // 16
  POP_REG(r11)
  POP_REG(r10)  // 16
  POP_REG(r9)
  POP_REG(r8)  // 16
  POP_REG(rsi)
  POP_REG(rbp)  // 16
  POP_REG(rdx)
  POP_REG(rcx)  // 16
  POP_REG(rbx)
  POP_REG(rax)  // 16
  POP_REG(rdi)

  ret

  .cfi_endproc

  .globl NvEncodeAPICreateInstance
  .p2align 4
  .type NvEncodeAPICreateInstance, %function
NvEncodeAPICreateInstance:
  .cfi_startproc
  // Intel opt. manual says to
  // "make the fall-through code following a conditional branch be the likely target for a branch with a forward target"
  // to hint static predictor.
  cmpq $0, _libnvidia_encode_so_tramp_table+0(%rip)
  je 2f
1:
  jmp *_libnvidia_encode_so_tramp_table+0(%rip)
2:
  // Pass the symbol's table index (0) to the resolver, then retry
  // the indirect jump through the now-filled table slot.
  pushq $0
  .cfi_adjust_cfa_offset 8
  call _libnvidia_encode_so_save_regs_and_resolve
  addq $8, %rsp
  .cfi_adjust_cfa_offset -8
  jmp 1b
  .cfi_endproc

  .globl NvEncodeAPIGetMaxSupportedVersion
  .p2align 4
  .type NvEncodeAPIGetMaxSupportedVersion, %function
NvEncodeAPIGetMaxSupportedVersion:
  .cfi_startproc
  // Intel opt. manual says to
  // "make the fall-through code following a conditional branch be the likely target for a branch with a forward target"
  // to hint static predictor.
  cmpq $0, _libnvidia_encode_so_tramp_table+8(%rip)
  je 2f
1:
  jmp *_libnvidia_encode_so_tramp_table+8(%rip)
2:
  pushq $1
  .cfi_adjust_cfa_offset 8
  call _libnvidia_encode_so_save_regs_and_resolve
  addq $8, %rsp
  .cfi_adjust_cfa_offset -8
  jmp 1b
  .cfi_endproc
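
/*
 * For reference: _libnvidia_encode_so_tramp_resolve is defined elsewhere
 * (in the accompanying generated C file). The sketch below shows what such
 * a resolver could plausibly look like -- it is an illustrative assumption,
 * not the actual generated code; the soname "libnvidia-encode.so.1", the
 * sym_names array, and the dlopen flags are placeholders.
 *
 *   #include <dlfcn.h>
 *   #include <assert.h>
 *
 *   extern void *_libnvidia_encode_so_tramp_table[];
 *
 *   static const char *const sym_names[] = {
 *     "NvEncodeAPICreateInstance",          // index 0 -> table slot +0
 *     "NvEncodeAPIGetMaxSupportedVersion",  // index 1 -> table slot +8
 *   };
 *
 *   void _libnvidia_encode_so_tramp_resolve(int i) {
 *     static void *lib_handle;
 *     if (!lib_handle)
 *       lib_handle = dlopen("libnvidia-encode.so.1", RTLD_LAZY | RTLD_GLOBAL);
 *     assert(lib_handle && "failed to load libnvidia-encode.so");
 *     // Store the resolved address; the trampoline then retries the
 *     // indirect jump through the table, so later calls skip this path.
 *     _libnvidia_encode_so_tramp_table[i] = dlsym(lib_handle, sym_names[i]);
 *   }
 */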