/**
 * Lookup tables mapping Unicode decimal digits (general category Nd) to
 * their numeric value 0-9.
 *
 * Strategy:
 *   - Code points LOW_MIN (0x0660) .. LOW_MAX (0xFFFF) live in a flat
 *     Uint8Array, TABLE, indexed by (codePoint - TABLE_OFFSET). An entry of
 *     NOT_DIGIT (0xFF) means "not a digit"; 0-9 is the digit value.
 *   - Code points above 0xFFFF live in HIGH_MAP (codePoint → 0-9); a flat
 *     array spanning the astral planes would be almost entirely empty.
 *   - ASCII '0'-'9' lie below TABLE_OFFSET and are stored in neither
 *     structure; callers are expected to handle them directly.
 *
 * Typical lookup:
 *   if (cp >= TABLE_OFFSET && cp <= 0xFFFF) v = TABLE[cp - TABLE_OFFSET];
 *   else if (cp > 0xFFFF)                   v = HIGH_MAP.get(cp) ?? NOT_DIGIT;
 *
 * Memory: 0xFFFF - 0x0660 + 1 = 63,904 entries × 1 byte ≈ 62 KB, built once
 * at module load.
 */

// Zero code point of every run of ten consecutive Unicode Nd (decimal digit)
// characters, as of Unicode 16.0. Each run is zero+0 .. zero+9.
const SCRIPT_ZEROS = [
  // Basic Latin (ASCII) — listed for completeness only; it lies below
  // LOW_MIN and is skipped explicitly when the tables are built.
  0x0030, // 0-9

  // Arabic scripts
  0x0660, // Arabic-Indic ٠١٢٣٤٥٦٧٨٩
  0x06F0, // Extended Arabic-Indic (Urdu/Persian/Sindhi) ۰۱۲۳

  // NKo
  0x07C0, // NKo

  // Indic scripts
  0x0966, // Devanagari ०१२३४५६७८९
  0x09E6, // Bengali ০১২৩৪৫৬৭৮৯
  0x0A66, // Gurmukhi ੦੧੨੩੪੫੬੭੮੯
  0x0AE6, // Gujarati ૦૧૨૩૪૫૬૭૮૯
  0x0B66, // Odia ୦୧୨୩୪୫୬୭୮୯
  0x0BE6, // Tamil ௦௧௨௩௪௫௬௭௮௯
  0x0C66, // Telugu ౦౧౨౩౪౫౬౭౮౯
  0x0CE6, // Kannada ೦೧೨೩೪೫೬೭೮೯
  0x0D66, // Malayalam ൦൧൨൩൪൫൬൭൮൯
  0x0DE6, // Sinhala Lith ෦෧෨෩෪෫෬෭෮෯

  // Southeast Asian scripts
  0x0E50, // Thai ๐๑๒๓๔๕๖๗๘๙
  0x0ED0, // Lao ໐໑໒໓໔໕໖໗໘໙
  0x0F20, // Tibetan ༠༡༢༣༤༥༦༧༨༩
  0x1040, // Myanmar ၀၁၂၃၄၅၆၇၈၉
  0x1090, // Myanmar Shan ႐႑႒႓႔႕႖႗႘႙
  0x17E0, // Khmer ០១២៣៤៥៦៧៨៩
  0x1810, // Mongolian ᠐᠑᠒᠓᠔᠕᠖᠗᠘᠙
  0x1946, // Limbu ᥆᥇᥈᥉᥊᥋᥌᥍᥎᥏
  0x19D0, // New Tai Lue ᧐᧑᧒᧓᧔᧕᧖᧗᧘᧙
  0x1A80, // Tai Tham Hora ᪀᪁᪂᪃᪄᪅᪆᪇᪈᪉
  0x1A90, // Tai Tham Tham ᪐᪑᪒᪓᪔᪕᪖᪗᪘᪙
  0x1B50, // Balinese ᭐᭑᭒᭓᭔᭕᭖᭗᭘᭙
  0x1BB0, // Sundanese ᮰᮱᮲᮳᮴᮵᮶᮷᮸᮹
  0x1C40, // Lepcha ᱀᱁᱂᱃᱄᱅᱆᱇᱈᱉
  0x1C50, // Ol Chiki ᱐᱑᱒᱓᱔᱕᱖᱗᱘᱙

  // Remaining BMP scripts (U+A000 and above)
  0xA620, // Vai
  0xA8D0, // Saurashtra
  0xA900, // Kayah Li
  0xA9D0, // Javanese
  0xA9F0, // Myanmar Tai Laing
  0xAA50, // Cham
  0xABF0, // Meetei Mayek

  // Fullwidth (CJK context)
  0xFF10, // Fullwidth 0123456789

  // Mathematical digit variants (Unicode math block)
  0x1D7CE, // Mathematical Bold
  0x1D7D8, // Mathematical Double-Struck
  0x1D7E2, // Mathematical Sans-Serif
  0x1D7EC, // Mathematical Sans-Serif Bold
  0x1D7F6, // Mathematical Monospace

  // Other scripts (supplementary planes)
  0x104A0, // Osmanya 𐒠𐒡𐒢𐒣𐒤𐒥𐒦𐒧𐒨𐒩
  0x10D30, // Hanifi Rohingya 𐴰𐴱𐴲𐴳𐴴𐴵𐴶𐴷𐴸𐴹
  0x10D40, // Garay
  0x11066, // Brahmi 𑁦𑁧𑁨𑁩𑁪𑁫𑁬𑁭𑁮𑁯
  0x110F0, // Sora Sompeng 𑃰𑃱𑃲𑃳𑃴𑃵𑃶𑃷𑃸𑃹
  0x11136, // Chakma 𑄶𑄷𑄸𑄹𑄺𑄻𑄼𑄽𑄾𑄿
  0x111D0, // Sharada 𑇐𑇑𑇒𑇓𑇔𑇕𑇖𑇗𑇘𑇙
  0x112F0, // Khudawadi 𑋰𑋱𑋲𑋳𑋴𑋵𑋶𑋷𑋸𑋹
  0x11450, // Newa 𑑐𑑑𑑒𑑓𑑔𑑕𑑖𑑗𑑘𑑙
  0x114D0, // Tirhuta 𑓐𑓑𑓒𑓓𑓔𑓕𑓖𑓗𑓘𑓙
  0x11650, // Modi 𑙐𑙑𑙒𑙓𑙔𑙕𑙖𑙗𑙘𑙙
  0x116C0, // Takri 𑛀𑛁𑛂𑛃𑛄𑛅𑛆𑛇𑛈𑛉
  0x116D0, // Myanmar Pao
  0x116DA, // Myanmar Eastern Pwo Karen
  0x11730, // Ahom 𑜰𑜱𑜲𑜳𑜴𑜵𑜶𑜷𑜸𑜹
  0x118E0, // Warang Citi 𑣠𑣡𑣢𑣣𑣤𑣥𑣦𑣧𑣨𑣩
  0x11950, // Dives Akuru 𑥐𑥑𑥒𑥓𑥔𑥕𑥖𑥗𑥘𑥙
  0x11BF0, // Sunuwar 𑯰𑯱𑯲𑯳𑯴𑯵𑯶𑯷𑯸𑯹
  0x11C50, // Bhaiksuki 𑱐𑱑𑱒𑱓𑱔𑱕𑱖𑱗𑱘𑱙
  0x11D50, // Masaram Gondi 𑵐𑵑𑵒𑵓𑵔𑵕𑵖𑵗𑵘𑵙
  0x11DA0, // Gunjala Gondi 𑶠𑶡𑶢𑶣𑶤𑶥𑶦𑶧𑶨𑶩
  0x11F50, // Kawi 𑽐𑽑𑽒𑽓𑽔𑽕𑽖𑽗𑽘𑽙
  0x16130, // Gurung Khema
  0x16A60, // Mro 𖩠𖩡𖩢𖩣𖩤𖩥𖩦𖩧𖩨𖩩
  0x16AC0, // Tangsa 𖫀𖫁𖫂𖫃𖫄𖫅𖫆𖫇𖫈𖫉
  0x16B50, // Pahawh Hmong 𖭐𖭑𖭒𖭓𖭔𖭕𖭖𖭗𖭘𖭙
  0x16D70, // Kirat Rai
  0x1CCF0, // Outlined digits (Symbols for Legacy Computing Supplement)
  0x1E140, // Nyiakeng Puachue Hmong 𞅀𞅁𞅂𞅃𞅄𞅅𞅆𞅇𞅈𞅉
  0x1E2F0, // Wancho 𞋰𞋱𞋲𞋳𞋴𞋵𞋶𞋷𞋸𞋹
  0x1E4F0, // Nag Mundari 𞓰𞓱𞓲𞓳𞓴𞓵𞓶𞓷𞓸𞓹
  0x1E5F1, // Ol Onal
  0x1E950, // Adlam 𞥐𞥑𞥒𞥓𞥔𞥕𞥖𞥗𞥘𞥙
  0x1FBF0, // Segmented digit symbols 🯰🯱🯲🯳🯴🯵🯶🯷🯸🯹
];

// Sentinel stored in TABLE for code points that are not decimal digits.
const NOT_DIGIT = 0xFF;

// Sparse map for digits above 0xFFFF: codePoint → digit value (0-9).
const HIGH_MAP = new Map();

const LOW_MAX = 0xFFFF;
const LOW_MIN = 0x0660; // first non-ASCII digit script

// Flat Uint8Array covering LOW_MIN .. LOW_MAX, every slot preset to NOT_DIGIT.
const TABLE_OFFSET = LOW_MIN;
const TABLE_SIZE = LOW_MAX - LOW_MIN + 1;
const TABLE = new Uint8Array(TABLE_SIZE).fill(NOT_DIGIT);

for (const zero of SCRIPT_ZEROS) {
  for (let d = 0; d < 10; d++) {
    const cp = zero + d;
    if (cp < LOW_MIN) {
      // ASCII digits sit below the table's range. Skip them explicitly
      // rather than relying on the typed array silently ignoring a write
      // to a negative index.
      continue;
    }
    if (cp <= LOW_MAX) {
      TABLE[cp - TABLE_OFFSET] = d;
    } else {
      HIGH_MAP.set(cp, d);
    }
  }
}

export { TABLE, TABLE_OFFSET, HIGH_MAP, NOT_DIGIT };