135 lines
4.2 KiB
JavaScript
135 lines
4.2 KiB
JavaScript
'use strict';
|
||
|
||
import { TABLE, TABLE_OFFSET, HIGH_MAP, NOT_DIGIT } from './digitTable.js';
|
||
|
||
// UTF-16 code units of the ASCII characters that are already in normalized form.
const CHAR_0 = 48; // '0'.charCodeAt(0)
const CHAR_9 = 57; // '9'.charCodeAt(0)
const CHAR_MINUS = 45; // '-'.charCodeAt(0)

// Unicode minus/hyphen variants worth normalizing to ASCII '-' in numeric context:
//   U+2212  MINUS SIGN              −  (mathematically correct minus)
//   U+FF0D  FULLWIDTH HYPHEN-MINUS  －  (Japanese fullwidth context)
//   U+FE63  SMALL HYPHEN-MINUS      ﹣  (small form variant)
//
// NOT normalized (deliberate):
//   U+2013  EN DASH  –  (punctuation, not a numeric sign)
//   U+2014  EM DASH  —  (punctuation)
//   U+2010  HYPHEN   ‐  (typographic hyphen)
//
// Rationale: only characters a human or locale formatter would plausibly use
// as a numeric minus sign are normalized. Dashes used for punctuation are left
// alone to avoid mangling non-numeric strings.
const MINUS_SET = new Set([0x2212, 0xFF0D, 0xFE63]);
|
||
|
||
/**
 * Look up the digit value of a BMP code unit at or above TABLE_OFFSET.
 *
 * @param {number} cc UTF-16 code unit, expected to be >= TABLE_OFFSET.
 * @returns {number|undefined} The TABLE entry, or undefined when the entry
 *   is NOT_DIGIT or the index lies outside TABLE.
 */
function bmpDigit(cc) {
  const d = TABLE[cc - TABLE_OFFSET];
  // An index past the end of TABLE reads as undefined; without this guard it
  // would be mistaken for a digit and emitted as String.fromCharCode(NaN).
  return d === undefined || d === NOT_DIGIT ? undefined : d;
}

/**
 * Look up the digit value of the surrogate pair that starts at index i.
 *
 * @param {string} str Source string.
 * @param {number} i Index of the high surrogate.
 * @param {number} len Length of str.
 * @param {number} high Code unit at index i (0xD800..0xDBFF).
 * @returns {number|undefined} The HIGH_MAP entry for the decoded code point,
 *   or undefined when the high surrogate is unpaired or the code point has no
 *   entry.
 */
function supplementaryDigit(str, i, len, high) {
  if (i + 1 >= len) return undefined;

  const low = str.charCodeAt(i + 1);
  if (low < 0xDC00 || low > 0xDFFF) return undefined;

  const cp = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00);
  return HIGH_MAP.get(cp);
}

/**
 * Normalize all Unicode decimal digit characters in a string to ASCII (0-9),
 * and normalize Unicode minus variants to ASCII '-' (U+002D).
 *
 * Non-digit, non-minus characters are passed through unchanged. A non-string
 * argument is returned as-is.
 *
 * Performance design:
 *   - Fast path: a first scan looks for the first convertible character; if
 *     there is none, the original string is returned (zero allocation).
 *   - BMP code units at or above TABLE_OFFSET: flat TABLE lookup (O(1)).
 *   - Supplementary plane digits (encoded as surrogate pairs): HIGH_MAP lookup.
 *   - Minus variants: checked inline against the small fixed MINUS_SET.
 *
 * @param {string} str
 * @returns {string}
 */
function anynum(str) {
  if (typeof str !== 'string') return str;

  const len = str.length;
  if (len === 0) return str;

  // Pass 1: find the first character that needs conversion.
  let firstHit = -1;

  for (let i = 0; i < len; i++) {
    const cc = str.charCodeAt(i);

    // ASCII digit or ASCII minus: already normalized.
    if ((cc >= CHAR_0 && cc <= CHAR_9) || cc === CHAR_MINUS) continue;

    // Below the table's first entry only a minus variant can match.
    if (cc < TABLE_OFFSET) {
      if (MINUS_SET.has(cc)) { firstHit = i; break; }
      continue;
    }

    // High surrogates must be handled before the TABLE lookup.
    if (cc >= 0xD800 && cc <= 0xDBFF) {
      if (supplementaryDigit(str, i, len, cc) !== undefined) { firstHit = i; break; }
      continue;
    }

    if (MINUS_SET.has(cc) || bmpDigit(cc) !== undefined) { firstHit = i; break; }
  }

  // Nothing to replace: hand back the original string untouched.
  if (firstHit === -1) return str;

  // Pass 2: keep the untouched prefix, then convert from firstHit onward.
  let out = str.slice(0, firstHit);

  for (let i = firstHit; i < len; i++) {
    const cc = str.charCodeAt(i);

    if ((cc >= CHAR_0 && cc <= CHAR_9) || cc === CHAR_MINUS) {
      out += str[i];
      continue;
    }

    if (cc < TABLE_OFFSET) {
      out += MINUS_SET.has(cc) ? '-' : str[i];
      continue;
    }

    if (cc >= 0xD800 && cc <= 0xDBFF) {
      const d = supplementaryDigit(str, i, len, cc);
      if (d === undefined) {
        // Not a mapped digit: emit the high surrogate as-is; the following
        // code unit is handled by the next iteration.
        out += str[i];
      } else {
        out += String.fromCharCode(CHAR_0 + d);
        i++; // consume the low surrogate
      }
      continue;
    }

    if (MINUS_SET.has(cc)) {
      out += '-';
      continue;
    }

    const d = bmpDigit(cc);
    out += d === undefined ? str[i] : String.fromCharCode(CHAR_0 + d);
  }

  return out;
}
|
||
|
||
export { anynum };
|
||
export default anynum; |