utf8_util.c 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. /* Copyright 2013 Google Inc. All Rights Reserved.
  2. Distributed under MIT license.
  3. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
  4. */
  5. /* Heuristics for deciding about the UTF8-ness of strings. */
  6. #include "./utf8_util.h"
  7. #include <brotli/types.h>
  8. #if defined(__cplusplus) || defined(c_plusplus)
  9. extern "C" {
  10. #endif
  /* Decodes a single symbol from the front of |input|, which holds |size|
     readable bytes; input[0] is always read, so |size| must be at least 1.

     On success stores the decoded code point in |*symbol| and returns the
     number of bytes that encode it (1..4). A sequence is accepted only if:
       - the lead byte and every continuation byte have the expected bit
         pattern, and enough bytes are available;
       - the value is not an overlong encoding (it must exceed the largest
         value representable by the next shorter form);
       - a one-byte value is non-zero (NUL is treated as "not UTF8");
       - a four-byte value does not exceed 0x10FFFF.
     No check for the surrogate range is made.

     Otherwise stores 0x110000 | input[0], a marker lying above the Unicode
     code space, and returns 1 so that the caller advances by one byte. */
  static size_t BrotliParseAsUTF8(
      int* symbol, const uint8_t* input, size_t size) {
    const int lead = input[0];
    int code_point;

    if (lead < 0x80) {
      /* 1-byte UTF8 (ASCII): 0xxxxxxx */
      if (lead != 0) {
        *symbol = lead;
        return 1;
      }
    } else if ((lead & 0xE0) == 0xC0) {
      /* 2-byte UTF8: 110xxxxx 10xxxxxx */
      if (size > 1u && (input[1] & 0xC0) == 0x80) {
        code_point = ((lead & 0x1F) << 6) | (input[1] & 0x3F);
        if (code_point > 0x7F) {
          *symbol = code_point;
          return 2;
        }
      }
    } else if ((lead & 0xF0) == 0xE0) {
      /* 3-byte UTF8: 1110xxxx 10xxxxxx 10xxxxxx */
      if (size > 2u &&
          (input[1] & 0xC0) == 0x80 &&
          (input[2] & 0xC0) == 0x80) {
        code_point = ((lead & 0x0F) << 12) |
                     ((input[1] & 0x3F) << 6) |
                     (input[2] & 0x3F);
        if (code_point > 0x7FF) {
          *symbol = code_point;
          return 3;
        }
      }
    } else if ((lead & 0xF8) == 0xF0) {
      /* 4-byte UTF8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      if (size > 3u &&
          (input[1] & 0xC0) == 0x80 &&
          (input[2] & 0xC0) == 0x80 &&
          (input[3] & 0xC0) == 0x80) {
        code_point = ((lead & 0x07) << 18) |
                     ((input[1] & 0x3F) << 12) |
                     ((input[2] & 0x3F) << 6) |
                     (input[3] & 0x3F);
        if (code_point > 0xFFFF && code_point <= 0x10FFFF) {
          *symbol = code_point;
          return 4;
        }
      }
    }

    /* Not UTF8: emit a special symbol above the UTF8 code space. */
    *symbol = 0x110000 | lead;
    return 1;
  }
  60. /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
  61. BROTLI_BOOL BrotliIsMostlyUTF8(
  62. const uint8_t* data, const size_t pos, const size_t mask,
  63. const size_t length, const double min_fraction) {
  64. size_t size_utf8 = 0;
  65. size_t i = 0;
  66. while (i < length) {
  67. int symbol;
  68. size_t bytes_read =
  69. BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
  70. i += bytes_read;
  71. if (symbol < 0x110000) size_utf8 += bytes_read;
  72. }
  73. return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length);
  74. }
  75. #if defined(__cplusplus) || defined(c_plusplus)
  76. } /* extern "C" */
  77. #endif