123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160 |
- /*
- * Copyright 2018-2020 Yury Gribov
- *
- * The MIT License (MIT)
- *
- * Use of this source code is governed by MIT license that can be
- * found in the LICENSE.txt file.
- */
// Lazy-binding table for the libnvidia-encode.so trampolines.
// Each slot is one 8-byte pointer and starts out as zero.  The trampolines
// in this file test a slot and jump through it once it is non-zero:
//   +0   NvEncodeAPICreateInstance
//   +8   NvEncodeAPIGetMaxSupportedVersion
//   +16  reserved; not referenced by any code in this file
// NOTE(review): the slots are presumably filled in by
// _libnvidia_encode_so_tramp_resolve, which is defined outside this file.
        .data
        .globl  _libnvidia_encode_so_tramp_table
        .hidden _libnvidia_encode_so_tramp_table
        .balign 8
_libnvidia_encode_so_tramp_table:
        .quad   0                       // slot 0 (+0)
        .quad   0                       // slot 1 (+8)
        .quad   0                       // slot 2 (+16)
// -----------------------------------------------------------------------
// _libnvidia_encode_so_save_regs_and_resolve
//
// Slow path shared by the trampolines in this file, taken while a table
// slot is still zero.  Saves registers, calls
// _libnvidia_encode_so_tramp_resolve with the function index in %rdi,
// restores the registers and returns to the trampoline.
//
// ABI:       custom.  Must be entered with a call instruction issued right
//            after the trampoline pushed the function index.
// In:        0(%rsp) = return address into the trampoline
//            8(%rsp) = function index pushed by the trampoline
// Out:       every register listed under Saved holds its entry value again,
//            so whatever the resolver returns in %rax is discarded.
//            RFLAGS is not preserved.
// Saved:     rdi rax rbx rcx rdx rbp rsi r8-r15   (15 x 8 bytes)
//            xmm0-xmm7                            (8 x 16 bytes)
// Not saved: x87 state, xmm8-xmm15, upper YMM/ZMM halves (see FIXME below).
// Stack:     248 bytes below the return address.
// -----------------------------------------------------------------------
        .text

        // Resolver is defined outside this file; only declared here.
        .globl  _libnvidia_encode_so_tramp_resolve
        .hidden _libnvidia_encode_so_tramp_resolve

        .globl  _libnvidia_encode_so_save_regs_and_resolve
        .hidden _libnvidia_encode_so_save_regs_and_resolve
        .type   _libnvidia_encode_so_save_regs_and_resolve, %function
_libnvidia_encode_so_save_regs_and_resolve:
        .cfi_startproc

// Helper macros.  Each one keeps the DWARF CFA offset and the per-register
// unwind rules in step with the stack adjustment it performs.
#define PUSH_REG(reg) pushq %reg ; .cfi_adjust_cfa_offset 8; .cfi_rel_offset reg, 0
#define POP_REG(reg) popq %reg ; .cfi_adjust_cfa_offset -8; .cfi_restore reg
#define DEC_STACK(d) subq $d, %rsp; .cfi_adjust_cfa_offset d
#define INC_STACK(d) addq $d, %rsp; .cfi_adjust_cfa_offset -d
// movdqa faults on an address that is not 16-byte aligned, so the XMM
// spill slots rely on %rsp being 16-byte aligned when they are used.
#define PUSH_XMM_REG(reg) DEC_STACK(16); movdqa %reg, (%rsp); .cfi_rel_offset reg, 0
#define POP_XMM_REG(reg) movdqa (%rsp), %reg; .cfi_restore reg; INC_STACK(16)

        // Slow path which calls dlsym, taken only on first call.
        // All registers are stored to handle arbitrary calling conventions
        // (except x87 FPU registers which do not have to be preserved).
        // For Dwarf directives, read https://www.imperialviolet.org/2017/01/18/cfi.html.

        // FIXME: AVX (YMM, ZMM) registers are NOT saved to simplify code.

        // Lines tagged "16" leave %rsp 16-byte aligned, assuming the
        // trampoline itself was entered with SysV alignment (rsp % 16 == 8).
        PUSH_REG(rdi)                   // 16
        // Stack now: 0(%rsp) saved rdi, 8(%rsp) return address,
        // 0x10(%rsp) function index -> first argument of the resolver.
        mov     0x10(%rsp), %rdi
        PUSH_REG(rax)
        PUSH_REG(rbx)                   // 16
        PUSH_REG(rcx)
        PUSH_REG(rdx)                   // 16
        PUSH_REG(rbp)
        PUSH_REG(rsi)                   // 16
        PUSH_REG(r8)
        PUSH_REG(r9)                    // 16
        PUSH_REG(r10)
        PUSH_REG(r11)                   // 16
        PUSH_REG(r12)
        PUSH_REG(r13)                   // 16
        PUSH_REG(r14)
        PUSH_REG(r15)                   // 16

        // Each XMM spill moves %rsp by 16, so alignment is kept.
        PUSH_XMM_REG(xmm0)
        PUSH_XMM_REG(xmm1)
        PUSH_XMM_REG(xmm2)
        PUSH_XMM_REG(xmm3)
        PUSH_XMM_REG(xmm4)
        PUSH_XMM_REG(xmm5)
        PUSH_XMM_REG(xmm6)
        PUSH_XMM_REG(xmm7)

        // 15 GPR pushes (120 bytes) plus 8 XMM slots (128 bytes) on top of
        // an entry %rsp with rsp % 16 == 8 leave %rsp 16-byte aligned here,
        // which is what the SysV ABI requires at a call site.
        call    _libnvidia_encode_so_tramp_resolve

        // Restore in exact reverse order of the saves above.
        POP_XMM_REG(xmm7)
        POP_XMM_REG(xmm6)
        POP_XMM_REG(xmm5)
        POP_XMM_REG(xmm4)
        POP_XMM_REG(xmm3)
        POP_XMM_REG(xmm2)
        POP_XMM_REG(xmm1)
        POP_XMM_REG(xmm0)               // 16

        POP_REG(r15)
        POP_REG(r14)                    // 16
        POP_REG(r13)
        POP_REG(r12)                    // 16
        POP_REG(r11)
        POP_REG(r10)                    // 16
        POP_REG(r9)
        POP_REG(r8)                     // 16
        POP_REG(rsi)
        POP_REG(rbp)                    // 16
        POP_REG(rdx)
        POP_REG(rcx)                    // 16
        POP_REG(rbx)
        POP_REG(rax)                    // 16
        POP_REG(rdi)

        // Back to the trampoline, which still has to drop the pushed index.
        ret
        .cfi_endproc
        // Record the symbol size so ELF tools can attribute addresses
        // inside this routine to it.
        .size   _libnvidia_encode_so_save_regs_and_resolve, .-_libnvidia_encode_so_save_regs_and_resolve
- /*
- * Copyright 2018-2019 Yury Gribov
- *
- * The MIT License (MIT)
- *
- * Use of this source code is governed by MIT license that can be
- * found in the LICENSE.txt file.
- */
// -----------------------------------------------------------------------
// NvEncodeAPICreateInstance -- lazy-binding trampoline, table slot 0.
//
// Fast path: slot 0 of _libnvidia_encode_so_tramp_table is non-zero, so
//            control tail-jumps through it with all argument registers and
//            the stack exactly as the caller left them.
// Slow path: slot 0 is zero.  Index 0 is pushed,
//            _libnvidia_encode_so_save_regs_and_resolve is called, the
//            index is dropped again and the fast path is retried.
// Clobbers:  RFLAGS (cmpq / addq); nothing else in this stub itself.
// NOTE(review): the retry assumes the resolver stored a non-zero address
// in slot 0; if the slot were still zero the jump would go to address 0.
// -----------------------------------------------------------------------
        .globl  NvEncodeAPICreateInstance
        .p2align 4
        .type   NvEncodeAPICreateInstance, %function
NvEncodeAPICreateInstance:
        .cfi_startproc
        // Intel opt. manual says to
        // "make the fall-through code following a conditional branch be the likely target for a branch with a forward target"
        // to hint static predictor.
        cmpq    $0, _libnvidia_encode_so_tramp_table+0(%rip)
        je      2f                      // slot still empty -> resolve first
1:
        // Tail jump: the target returns straight to our caller.
        jmp     *_libnvidia_encode_so_tramp_table+0(%rip)
2:
        pushq   $0                      // function index for the resolver
        .cfi_adjust_cfa_offset 8
        call    _libnvidia_encode_so_save_regs_and_resolve
        addq    $8, %rsp                // drop the pushed index
        .cfi_adjust_cfa_offset -8
        jmp     1b
        .cfi_endproc
        // Record the symbol size so ELF tools can attribute addresses
        // inside this stub to it.
        .size   NvEncodeAPICreateInstance, .-NvEncodeAPICreateInstance
- /*
- * Copyright 2018-2019 Yury Gribov
- *
- * The MIT License (MIT)
- *
- * Use of this source code is governed by MIT license that can be
- * found in the LICENSE.txt file.
- */
// -----------------------------------------------------------------------
// NvEncodeAPIGetMaxSupportedVersion -- lazy-binding trampoline, table slot 1.
//
// Fast path: the slot at _libnvidia_encode_so_tramp_table+8 is non-zero, so
//            control tail-jumps through it with all argument registers and
//            the stack exactly as the caller left them.
// Slow path: the slot is zero.  Index 1 is pushed,
//            _libnvidia_encode_so_save_regs_and_resolve is called, the
//            index is dropped again and the fast path is retried.
// Clobbers:  RFLAGS (cmpq / addq); nothing else in this stub itself.
// NOTE(review): the retry assumes the resolver stored a non-zero address
// in the slot; if the slot were still zero the jump would go to address 0.
// -----------------------------------------------------------------------
        .globl  NvEncodeAPIGetMaxSupportedVersion
        .p2align 4
        .type   NvEncodeAPIGetMaxSupportedVersion, %function
NvEncodeAPIGetMaxSupportedVersion:
        .cfi_startproc
        // Intel opt. manual says to
        // "make the fall-through code following a conditional branch be the likely target for a branch with a forward target"
        // to hint static predictor.
        cmpq    $0, _libnvidia_encode_so_tramp_table+8(%rip)
        je      2f                      // slot still empty -> resolve first
1:
        // Tail jump: the target returns straight to our caller.
        jmp     *_libnvidia_encode_so_tramp_table+8(%rip)
2:
        pushq   $1                      // function index for the resolver
        .cfi_adjust_cfa_offset 8
        call    _libnvidia_encode_so_save_regs_and_resolve
        addq    $8, %rsp                // drop the pushed index
        .cfi_adjust_cfa_offset -8
        jmp     1b
        .cfi_endproc
        // Record the symbol size so ELF tools can attribute addresses
        // inside this stub to it.
        .size   NvEncodeAPIGetMaxSupportedVersion, .-NvEncodeAPIGetMaxSupportedVersion
|