pdf_find_utils.js 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. /**
  2. * @licstart The following is the entire license notice for the
  3. * Javascript code in this page
  4. *
  5. * Copyright 2021 Mozilla Foundation
  6. *
  7. * Licensed under the Apache License, Version 2.0 (the "License");
  8. * you may not use this file except in compliance with the License.
  9. * You may obtain a copy of the License at
  10. *
  11. * http://www.apache.org/licenses/LICENSE-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS,
  15. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. * See the License for the specific language governing permissions and
  17. * limitations under the License.
  18. *
  19. * @licend The above is the entire license notice for the
  20. * Javascript code in this page
  21. */
  22. "use strict";
  23. Object.defineProperty(exports, "__esModule", {
  24. value: true
  25. });
  26. exports.getCharacterType = getCharacterType;
  27. exports.CharacterType = void 0;
  28. const CharacterType = {
  29. SPACE: 0,
  30. ALPHA_LETTER: 1,
  31. PUNCT: 2,
  32. HAN_LETTER: 3,
  33. KATAKANA_LETTER: 4,
  34. HIRAGANA_LETTER: 5,
  35. HALFWIDTH_KATAKANA_LETTER: 6,
  36. THAI_LETTER: 7
  37. };
  38. exports.CharacterType = CharacterType;
  /**
   * Reports whether a character code lies strictly below 0x2e80.
   *
   * `getCharacterType` uses this as its first split: codes below the limit go
   * through the ASCII / Thai / no-break-space checks, everything else through
   * the Han / Kana checks.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` when `charCode` is less than 0x2e80.
   */
  function isAlphabeticalScript(charCode) {
    const upperLimit = 0x2e80;
    return upperLimit > charCode;
  }
  /**
   * Reports whether a character code is in the 7-bit ASCII range 0x00–0x7f.
   *
   * An explicit range comparison is used instead of the bit mask
   * `(charCode & 0xff80) === 0`: the mask only inspects bits 7–15, so a value
   * above 0xffff whose low 16 bits look like ASCII (for example 0x10041) would
   * be accepted. Results for 0x0000–0xffff are the same as with the mask.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` when `0 <= charCode < 0x80`.
   */
  function isAscii(charCode) {
    return charCode >= 0 && charCode < 0x80;
  }
  /**
   * Reports whether a character code is an ASCII letter.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` for 0x61–0x7a (`a`–`z`) or 0x41–0x5a (`A`–`Z`).
   */
  function isAsciiAlpha(charCode) {
    const isLowerCase = charCode >= 0x61 && charCode <= 0x7a;
    if (isLowerCase) {
      return true;
    }
    return charCode >= 0x41 && charCode <= 0x5a;
  }
  /**
   * Reports whether a character code is an ASCII decimal digit.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` for 0x30–0x39 (`0`–`9`).
   */
  function isAsciiDigit(charCode) {
    const codeOfZero = 0x30;
    const codeOfNine = 0x39;
    return charCode >= codeOfZero && charCode <= codeOfNine;
  }
  /**
   * Reports whether a character code is one of the four ASCII whitespace
   * characters this module recognises.
   *
   * @param {number} charCode - Character code to test (compared strictly, so
   *   non-number values never match).
   * @returns {boolean} `true` for space (0x20), tab (0x09), carriage return
   *   (0x0d) or line feed (0x0a).
   */
  function isAsciiSpace(charCode) {
    switch (charCode) {
      case 0x20: // space
      case 0x09: // horizontal tab
      case 0x0d: // carriage return
      case 0x0a: // line feed
        return true;
      default:
        return false;
    }
  }
  /**
   * Reports whether a character code falls in one of the two ranges this
   * module treats as Han: 0x3400–0x9fff or 0xf900–0xfaff.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` when `charCode` is inside either range
   *   (bounds inclusive).
   */
  function isHan(charCode) {
    const inMainRange = charCode >= 0x3400 && charCode <= 0x9fff;
    if (inMainRange) {
      return true;
    }
    return charCode >= 0xf900 && charCode <= 0xfaff;
  }
  /**
   * Reports whether a character code is in the range 0x30a0–0x30ff, which
   * this module treats as Katakana.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` when `0x30a0 <= charCode <= 0x30ff`.
   */
  function isKatakana(charCode) {
    const rangeStart = 0x30a0;
    const rangeEnd = 0x30ff;
    return charCode >= rangeStart && charCode <= rangeEnd;
  }
  /**
   * Reports whether a character code is in the range 0x3040–0x309f, which
   * this module treats as Hiragana.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` when `0x3040 <= charCode <= 0x309f`.
   */
  function isHiragana(charCode) {
    const rangeStart = 0x3040;
    const rangeEnd = 0x309f;
    return charCode >= rangeStart && charCode <= rangeEnd;
  }
  /**
   * Reports whether a character code is in the range 0xff60–0xff9f, which
   * this module treats as halfwidth Katakana.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` when `0xff60 <= charCode <= 0xff9f`.
   */
  function isHalfwidthKatakana(charCode) {
    const rangeStart = 0xff60;
    const rangeEnd = 0xff9f;
    return charCode >= rangeStart && charCode <= rangeEnd;
  }
  /**
   * Reports whether a character code is in the Thai range 0x0e00–0x0e7f.
   *
   * An explicit range comparison is used instead of the bit mask
   * `(charCode & 0xff80) === 0x0e00`: the mask only inspects bits 7–15, so a
   * value above 0xffff whose low 16 bits fall in the Thai range (for example
   * 0x10e01) would be accepted. Results for 0x0000–0xffff are the same as
   * with the mask.
   *
   * @param {number} charCode - Character code to test.
   * @returns {boolean} `true` when `0x0e00 <= charCode < 0x0e80`.
   */
  function isThai(charCode) {
    return charCode >= 0x0e00 && charCode < 0x0e80;
  }
  /**
   * Classifies a character code into one of the `CharacterType` categories.
   *
   * Decision order:
   *  1. Codes that are not "alphabetical script" (0x2e80 and above) are tested
   *     for Han, Katakana, Hiragana and halfwidth Katakana, in that order;
   *     anything else in that region is reported as `ALPHA_LETTER`.
   *  2. Non-ASCII codes below 0x2e80 are `THAI_LETTER` if Thai, `SPACE` if
   *     0xa0, and `ALPHA_LETTER` otherwise.
   *  3. ASCII codes are `SPACE` for ASCII whitespace, `ALPHA_LETTER` for
   *     letters, digits and underscore (0x5f), and `PUNCT` for the rest.
   *
   * @param {number} charCode - Character code to classify.
   * @returns {number} One of the `CharacterType` values.
   */
  function getCharacterType(charCode) {
    if (!isAlphabeticalScript(charCode)) {
      if (isHan(charCode)) {
        return CharacterType.HAN_LETTER;
      }
      if (isKatakana(charCode)) {
        return CharacterType.KATAKANA_LETTER;
      }
      if (isHiragana(charCode)) {
        return CharacterType.HIRAGANA_LETTER;
      }
      if (isHalfwidthKatakana(charCode)) {
        return CharacterType.HALFWIDTH_KATAKANA_LETTER;
      }
      return CharacterType.ALPHA_LETTER;
    }

    if (!isAscii(charCode)) {
      if (isThai(charCode)) {
        return CharacterType.THAI_LETTER;
      }
      // 0xa0 is the only non-ASCII code reported as a space.
      return charCode === 0xa0
        ? CharacterType.SPACE
        : CharacterType.ALPHA_LETTER;
    }

    if (isAsciiSpace(charCode)) {
      return CharacterType.SPACE;
    }

    // Letters, digits and underscore (0x5f) are grouped as ALPHA_LETTER.
    const isWordCharacter =
      isAsciiAlpha(charCode) || isAsciiDigit(charCode) || charCode === 0x5f;
    return isWordCharacter ? CharacterType.ALPHA_LETTER : CharacterType.PUNCT;
  }