libnvidia-encode.so.tramp.S 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. /*
  2. * Copyright 2018-2020 Yury Gribov
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Use of this source code is governed by MIT license that can be
  7. * found in the LICENSE.txt file.
  8. */
  // Declare that this object does not need an executable stack. Without a
  // .note.GNU-stack section GNU ld assumes hand-written assembly may require
  // one. pushsection/popsection leaves the current section untouched.
  .pushsection .note.GNU-stack,"",@progbits
  .popsection

  // Table of resolved function addresses, one 8-byte slot per trampoline.
  // A slot holding zero means "not resolved yet": the trampolines in this
  // file compare their slot against 0 and take the slow path in that case.
  // The slots at offsets 0 and 8 are used by the trampolines in this file;
  // 24 bytes are reserved, i.e. one slot more than is referenced here.
  // NOTE(review): the table is presumably filled in by
  // _libnvidia_encode_so_tramp_resolve (defined outside this file) - confirm.
  .data
  .globl _libnvidia_encode_so_tramp_table
  .hidden _libnvidia_encode_so_tramp_table
  .type _libnvidia_encode_so_tramp_table, %object
  .align 8
_libnvidia_encode_so_tramp_table:
  .zero 24
  .size _libnvidia_encode_so_tramp_table, .-_libnvidia_encode_so_tramp_table
  .text

  // Resolver entry point; it is only declared here and called below.
  .globl _libnvidia_encode_so_tramp_resolve
  .hidden _libnvidia_encode_so_tramp_resolve

  // _libnvidia_encode_so_save_regs_and_resolve
  //
  // Slow-path helper shared by the trampolines in this file. It saves the
  // general-purpose registers and xmm0-xmm7, calls
  // _libnvidia_encode_so_tramp_resolve with the symbol index in %rdi,
  // restores everything and returns to the trampoline.
  //
  // Stack on entry (set up by the trampoline):
  //    (%rsp)  return address into the trampoline
  //   8(%rsp)  symbol index pushed by the trampoline
  //
  // All saved registers are restored before returning, so from the point of
  // view of the trampoline only the flags are changed.
  .globl _libnvidia_encode_so_save_regs_and_resolve
  .hidden _libnvidia_encode_so_save_regs_and_resolve
  .type _libnvidia_encode_so_save_regs_and_resolve, %function
_libnvidia_encode_so_save_regs_and_resolve:
  .cfi_startproc

  // Push/pop helpers which keep the Dwarf CFI in sync with %rsp.
#define PUSH_REG(reg) pushq %reg ; .cfi_adjust_cfa_offset 8; .cfi_rel_offset reg, 0
#define POP_REG(reg) popq %reg ; .cfi_adjust_cfa_offset -8; .cfi_restore reg
#define DEC_STACK(d) subq $d, %rsp; .cfi_adjust_cfa_offset d
#define INC_STACK(d) addq $d, %rsp; .cfi_adjust_cfa_offset -d
#define PUSH_XMM_REG(reg) DEC_STACK(16); movdqa %reg, (%rsp); .cfi_rel_offset reg, 0
#define POP_XMM_REG(reg) movdqa (%rsp), %reg; .cfi_restore reg; INC_STACK(16)

  // Slow path which calls dlsym, taken only on first call.
  // All registers are stored to handle arbitrary calling conventions
  // (except x87 FPU registers which do not have to be preserved).
  // For Dwarf directives, read https://www.imperialviolet.org/2017/01/18/cfi.html.

  // FIXME: AVX (YMM, ZMM) registers are NOT saved to simplify code.

  // The "// 16" marks show where %rsp is 16-byte aligned, assuming the
  // trampoline itself was entered with an ABI-conforming stack.

  PUSH_REG(rdi)  // 16
  // After the push above the index pushed by the trampoline sits just
  // above the return address: saved rdi at 0, return address at 8, index at 16.
  mov 0x10(%rsp), %rdi
  PUSH_REG(rax)
  PUSH_REG(rbx)  // 16
  PUSH_REG(rcx)
  PUSH_REG(rdx)  // 16
  PUSH_REG(rbp)
  PUSH_REG(rsi)  // 16
  PUSH_REG(r8)
  PUSH_REG(r9)  // 16
  PUSH_REG(r10)
  PUSH_REG(r11)  // 16
  PUSH_REG(r12)
  PUSH_REG(r13)  // 16
  PUSH_REG(r14)
  PUSH_REG(r15)  // 16

  // movdqa faults on addresses that are not 16-byte aligned, so the XMM
  // saves rely on the alignment tracked above.
  PUSH_XMM_REG(xmm0)
  PUSH_XMM_REG(xmm1)
  PUSH_XMM_REG(xmm2)
  PUSH_XMM_REG(xmm3)
  PUSH_XMM_REG(xmm4)
  PUSH_XMM_REG(xmm5)
  PUSH_XMM_REG(xmm6)
  PUSH_XMM_REG(xmm7)

  // With an ABI-conforming caller %rsp is 16-byte aligned here: the
  // trampoline pushed 8 bytes, its call pushed 8 more, and 15 * 8 + 8 * 16
  // bytes were saved above. This is the alignment the callee expects.
  call _libnvidia_encode_so_tramp_resolve

  // Restore in exact reverse order of the saves.
  POP_XMM_REG(xmm7)
  POP_XMM_REG(xmm6)
  POP_XMM_REG(xmm5)
  POP_XMM_REG(xmm4)
  POP_XMM_REG(xmm3)
  POP_XMM_REG(xmm2)
  POP_XMM_REG(xmm1)
  POP_XMM_REG(xmm0)  // 16

  POP_REG(r15)
  POP_REG(r14)  // 16
  POP_REG(r13)
  POP_REG(r12)  // 16
  POP_REG(r11)
  POP_REG(r10)  // 16
  POP_REG(r9)
  POP_REG(r8)  // 16
  POP_REG(rsi)
  POP_REG(rbp)  // 16
  POP_REG(rdx)
  POP_REG(rcx)  // 16
  POP_REG(rbx)
  POP_REG(rax)  // 16
  POP_REG(rdi)

  // The index argument is still on the stack; the trampoline removes it.
  ret

  .cfi_endproc
  // Record the function size so that debuggers and profilers can attribute
  // addresses inside the helper to this symbol.
  .size _libnvidia_encode_so_save_regs_and_resolve, .-_libnvidia_encode_so_save_regs_and_resolve
  85. /*
  86. * Copyright 2018-2019 Yury Gribov
  87. *
  88. * The MIT License (MIT)
  89. *
  90. * Use of this source code is governed by MIT license that can be
  91. * found in the LICENSE.txt file.
  92. */
  // NvEncodeAPICreateInstance
  //
  // Lazy-binding trampoline using slot 0 (offset 0) of
  // _libnvidia_encode_so_tramp_table.
  //   Fast path: the slot is non-zero, tail-jump to the address stored in it.
  //   Slow path: the slot is zero, push symbol index 0 and call
  //              _libnvidia_encode_so_save_regs_and_resolve, drop the index
  //              from the stack and retry the jump through the table.
  // No argument registers are touched by this code itself; the slow-path
  // helper saves and restores the registers it uses.
  .globl NvEncodeAPICreateInstance
  .p2align 4
  .type NvEncodeAPICreateInstance, %function
NvEncodeAPICreateInstance:
  .cfi_startproc
  // Intel opt. manual says to
  // "make the fall-through code following a conditional branch be the likely target for a branch with a forward target"
  // to hint static predictor.
  cmpq $0, _libnvidia_encode_so_tramp_table+0(%rip)
  je 2f
1:
  // Tail call: the target returns directly to our caller.
  jmp *_libnvidia_encode_so_tramp_table+0(%rip)
2:
  // Index of this symbol, read by the helper from the stack.
  pushq $0
  .cfi_adjust_cfa_offset 8
  call _libnvidia_encode_so_save_regs_and_resolve
  // Drop the index pushed above.
  addq $8, %rsp
  .cfi_adjust_cfa_offset -8
  jmp 1b
  .cfi_endproc
  // Record the function size in the ELF symbol table.
  .size NvEncodeAPICreateInstance, .-NvEncodeAPICreateInstance
  113. /*
  114. * Copyright 2018-2019 Yury Gribov
  115. *
  116. * The MIT License (MIT)
  117. *
  118. * Use of this source code is governed by MIT license that can be
  119. * found in the LICENSE.txt file.
  120. */
  // NvEncodeAPIGetMaxSupportedVersion
  //
  // Lazy-binding trampoline using slot 1 (offset 8) of
  // _libnvidia_encode_so_tramp_table.
  //   Fast path: the slot is non-zero, tail-jump to the address stored in it.
  //   Slow path: the slot is zero, push symbol index 1 and call
  //              _libnvidia_encode_so_save_regs_and_resolve, drop the index
  //              from the stack and retry the jump through the table.
  // No argument registers are touched by this code itself; the slow-path
  // helper saves and restores the registers it uses.
  .globl NvEncodeAPIGetMaxSupportedVersion
  .p2align 4
  .type NvEncodeAPIGetMaxSupportedVersion, %function
NvEncodeAPIGetMaxSupportedVersion:
  .cfi_startproc
  // Intel opt. manual says to
  // "make the fall-through code following a conditional branch be the likely target for a branch with a forward target"
  // to hint static predictor.
  cmpq $0, _libnvidia_encode_so_tramp_table+8(%rip)
  je 2f
1:
  // Tail call: the target returns directly to our caller.
  jmp *_libnvidia_encode_so_tramp_table+8(%rip)
2:
  // Index of this symbol, read by the helper from the stack.
  pushq $1
  .cfi_adjust_cfa_offset 8
  call _libnvidia_encode_so_save_regs_and_resolve
  // Drop the index pushed above.
  addq $8, %rsp
  .cfi_adjust_cfa_offset -8
  jmp 1b
  .cfi_endproc
  // Record the function size in the ELF symbol table.
  .size NvEncodeAPIGetMaxSupportedVersion, .-NvEncodeAPIGetMaxSupportedVersion