BillTracker/services/spreadsheetImportService.js

1389 lines
51 KiB
JavaScript
Raw Normal View History

2026-05-03 19:51:57 -05:00
'use strict';
// Security note: xlsx (SheetJS Community Edition) has known prototype-pollution
// and ReDoS CVEs (no OSS fix available as of 2026). Mitigations applied here:
// 1. cellFormula: false — never parse/execute formulas
// 2. cellHTML: false — never parse HTML markup
// 3. 10 MB file-size cap (enforced by caller via express.raw limit)
// 4. XLSX magic-bytes check before parsing
// 5. Endpoint requires authenticated session; no anonymous uploads
// 6. All cells treated as plain string data; no formula result access
const xlsx = require('xlsx');
const crypto = require('crypto');
const { getDb, ensureUserDefaultCategories } = require('../db/database');
// ─── Constants ────────────────────────────────────────────────────────────────
const SESSION_TTL_MS = 24 * 60 * 60 * 1000; // 24 h
const MAX_ROWS = 5_000;
const LABEL_PATTERNS = {
autopay: /\bauto(?:pay)?\b/i,
past_due: /\bpast\s*due\b/i,
double_pay: /\bdouble\s*pay\b/i,
skipped: /\b(?:skip(?:ped)?|n\/?a|not\s*applicable)\b/i,
};
const HEADER_PATTERNS = {
bill_name: /^(?:bill|name|bill\s*name|description|payee|vendor|service)$/i,
amount: /^(?:amount|amt|expected|expected\s*amount|cost|price|payment|paid|value)$/i,
due_date: /^(?:due\s*date|due|due\s*day)$/i,
paid_date: /^(?:paid\s*date|date\s*paid|payment\s*date|date\s*cleared|cleared\s*date)$/i,
date: /^(?:date|due\s*date|due|paid\s*date|when|day)$/i,
category: /^(?:category|cat|type|group)$/i,
notes: /^(?:notes?|comment|label|status|memo|remark)$/i,
};
const CATEGORY_KEYWORDS = [
{ words: ['electric', 'power', 'utility', 'utilities', 'water', 'sewer', 'gas'], categories: ['Utilities'] },
{ words: ['netflix', 'hulu', 'disney', 'spotify', 'subscription', 'streaming'], categories: ['Streaming', 'Entertainment', 'Subscriptions'] },
{ words: ['capital one', 'credit card', 'discover', 'visa', 'mastercard', 'amex'], categories: ['Credit Cards', 'Credit Card', 'Cards'] },
{ words: ['rent', 'mortgage'], categories: ['Housing'] },
{ words: ['insurance'], categories: ['Insurance'] },
{ words: ['loan', 'car payment', 'auto loan'], categories: ['Loans'] },
];
// Sheet names that clearly are not month/bill-data tabs
const NON_MONTH_SHEET_RE = /^(?:summary|totals?|dashboard|info|notes?|categories|settings?|overview|index|readme|instructions?|help|template|data|master|all|annual|yearly|archive|backup|charts?|graphs?|sheet\d+|.*tax(?:es)?|.*debt.*to.*als?|home\s*ownership\s*expenses)$/i;
// Full month-name lookup (abbrev and full forms)
const MONTH_LOOKUP = {
jan: 1, january: 1, januaru: 1,
feb: 2, february: 2, febuary: 2,
mar: 3, march: 3,
apr: 4, april: 4,
may: 5,
jun: 6, june: 6,
jul: 7, july: 7,
aug: 8, august: 8,
sep: 9, sept: 9, september: 9,
oct: 10, october: 10,
nov: 11, november: 11, novevmber: 11,
dec: 12, december: 12,
};
const MONTH_KEYS_BY_LENGTH = Object.keys(MONTH_LOOKUP).sort((a, b) => b.length - a.length);
// ─── Sheet Name Parsing ───────────────────────────────────────────────────────
/**
* Detect year and month from a worksheet tab name.
* Returns { year, month, is_non_month_sheet }.
* year and month may be null if not determinable.
* Exported for testing.
*/
function parseSheetName(name) {
const clean = (name || '').trim();
// Known non-data sheet names → skip
if (NON_MONTH_SHEET_RE.test(clean)) {
return { year: null, month: null, is_non_month_sheet: true };
}
// YYYY-MM or YYYY/MM (e.g. "2026-01", "2026/5")
let m = clean.match(/^(\d{4})[\/\-](\d{1,2})$/);
if (m) {
const yr = parseInt(m[1], 10), mo = parseInt(m[2], 10);
if (yr >= 2000 && yr <= 2100 && mo >= 1 && mo <= 12)
return { year: yr, month: mo, is_non_month_sheet: false };
}
// MM-YYYY or MM/YYYY (e.g. "01-2026", "1/2026")
m = clean.match(/^(\d{1,2})[\/\-](\d{4})$/);
if (m) {
const mo = parseInt(m[1], 10), yr = parseInt(m[2], 10);
if (yr >= 2000 && yr <= 2100 && mo >= 1 && mo <= 12)
return { year: yr, month: mo, is_non_month_sheet: false };
}
// Extract 4-digit year if present anywhere in the name. Some real workbook
// tabs use names like "July2017", so this cannot rely on word boundaries.
const yearM = clean.match(/(20\d{2})/);
const year = yearM ? parseInt(yearM[1], 10) : null;
// Extract a month name (full, abbreviated, common typos, or compact
// MonthYYYY / YYYYMonth forms from legacy spreadsheets).
const lower = clean.toLowerCase();
const compact = lower.replace(/[^a-z0-9]/g, '');
for (const key of MONTH_KEYS_BY_LENGTH) {
const wordMatch = new RegExp(`(^|[^a-z])${key}([^a-z]|$)`, 'i').test(lower);
const compactMonthYear = year && (
compact.includes(`${key}${year}`) ||
compact.includes(`${year}${key}`)
);
if (wordMatch || compactMonthYear) {
return { year, month: MONTH_LOOKUP[key], is_non_month_sheet: false };
}
}
// No month found — year might still be present (ambiguous)
return { year, month: null, is_non_month_sheet: false };
}
// ─── XLSX Parsing ─────────────────────────────────────────────────────────────
function isXlsxBuffer(buffer) {
// XLSX = ZIP file: magic bytes PK\x03\x04
return (
Buffer.isBuffer(buffer) &&
buffer.length >= 4 &&
buffer[0] === 0x50 && buffer[1] === 0x4b &&
buffer[2] === 0x03 && buffer[3] === 0x04
);
}
function parseXlsxBuffer(buffer) {
if (!isXlsxBuffer(buffer)) {
const err = new Error('Invalid file format. Only XLSX files are supported.');
err.status = 400;
throw err;
}
let workbook;
try {
workbook = xlsx.read(buffer, {
type: 'buffer',
cellFormula: false,
cellHTML: false,
cellNF: false,
cellStyles: false,
dense: false,
});
} catch {
const err = new Error('Could not read XLSX file. The file may be corrupted or in an unsupported format.');
err.status = 400;
throw err;
}
if (!workbook.SheetNames.length) {
const err = new Error('XLSX file contains no sheets.');
err.status = 400;
throw err;
}
return workbook;
}
function getSheetRows(workbook, sheetName) {
const sheet = workbook.Sheets[sheetName];
if (!sheet) return [];
// raw:false → formatted string values; no formula results can leak through
return xlsx.utils.sheet_to_json(sheet, { header: 1, defval: null, raw: false });
}
// ─── Header Detection ─────────────────────────────────────────────────────────
function detectHeaders(firstRow) {
if (!Array.isArray(firstRow)) return {};
const map = {};
firstRow.forEach((cell, idx) => {
if (cell == null) return;
const val = String(cell).trim();
for (const [field, pattern] of Object.entries(HEADER_PATTERNS)) {
if (pattern.test(val) && !(field in map)) map[field] = idx;
}
});
return map;
}
// ─── Row Classification ───────────────────────────────────────────────────────
function isBlankRow(cells) {
return cells.every((c) => c == null || String(c).trim() === '');
}
function isLikelyHeaderRow(cells) {
const nonEmpty = cells.filter((c) => c != null && String(c).trim() !== '');
if (nonEmpty.length === 0) return false;
let matches = 0;
for (const cell of nonEmpty) {
for (const pattern of Object.values(HEADER_PATTERNS)) {
if (pattern.test(String(cell).trim())) { matches++; break; }
}
}
return matches >= Math.ceil(nonEmpty.length * 0.5);
}
function isLikelyTotalRow(cells) {
return cells.some(
(c) => c != null && /^(?:total|subtotal|sum|grand\s*total)$/i.test(String(c).trim()),
);
}
// ─── Cell-Value Parsers ───────────────────────────────────────────────────────
function parseAmount(raw) {
if (raw == null) return null;
const str = String(raw).trim();
if (!str) return null;
const cleaned = str.replace(/[$,\s]/g, '').replace(/^\((.+)\)$/, '-$1');
const val = parseFloat(cleaned);
return Number.isFinite(val) ? val : null;
}
function parseDate(raw) {
if (raw == null) return null;
const str = String(raw).trim();
if (!str) return null;
// MM/DD/YYYY or M/D/YYYY
let m = str.match(/^(\d{1,2})\/(\d{1,2})\/(\d{4})$/);
if (m) {
const [, mon, day, yr] = m;
return {
year: parseInt(yr, 10),
month: parseInt(mon, 10),
day: parseInt(day, 10),
iso: `${yr}-${mon.padStart(2, '0')}-${day.padStart(2, '0')}`,
};
}
// YYYY-MM-DD
m = str.match(/^(\d{4})-(\d{2})-(\d{2})$/);
if (m) {
const [, yr, mon, day] = m;
return { year: parseInt(yr, 10), month: parseInt(mon, 10), day: parseInt(day, 10), iso: str };
}
// MM/DD (no year)
m = str.match(/^(\d{1,2})\/(\d{1,2})$/);
if (m) {
return { year: null, month: parseInt(m[1], 10), day: parseInt(m[2], 10), iso: null };
}
// Month name: "May 2026" or "May"
m = str.match(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s*(\d{4})?$/i);
if (m) {
const key = m[1].toLowerCase();
const month = MONTH_LOOKUP[key] ?? MONTH_LOOKUP[key.slice(0, 3)];
if (month) return { year: m[2] ? parseInt(m[2], 10) : null, month, day: null, iso: null };
}
return null;
}
function resolveDateIso(parsedDate, fallbackYear) {
if (!parsedDate?.month || !parsedDate?.day) return null;
const year = parsedDate.year ?? fallbackYear ?? null;
if (!year || year < 2000 || year > 2100) return null;
if (parsedDate.month < 1 || parsedDate.month > 12 || parsedDate.day < 1 || parsedDate.day > 31) return null;
return `${year}-${String(parsedDate.month).padStart(2, '0')}-${String(parsedDate.day).padStart(2, '0')}`;
}
function detectLabels(text) {
if (!text) return [];
return Object.entries(LABEL_PATTERNS)
.filter(([, pattern]) => pattern.test(text))
.map(([label]) => label);
}
// ─── Name Normalization ───────────────────────────────────────────────────────
function normalizeName(name) {
return String(name)
.trim()
.toLowerCase()
.replace(/\s+/g, ' ')
.replace(/[.,!?;:'"()]/g, '')
.trim();
}
// ─── Bill Matching ────────────────────────────────────────────────────────────
function canonicalizeName(name) {
const normalized = normalizeName(name)
.replace(/\bcap\b/g, 'capital')
.replace(/\bcc\b/g, 'credit card')
.replace(/\bco\b/g, 'company')
.replace(/\bsvc\b/g, 'service')
.replace(/\bservices\b/g, 'service');
return normalized
.split(' ')
.filter((t) => t && !['the', 'bill', 'payment', 'pay', 'paid'].includes(t))
.join(' ')
.trim();
}
function nameTokens(name) {
return canonicalizeName(name)
.split(' ')
.filter((t) => t.length > 1 && !['card', 'credit'].includes(t));
}
function matchScore(detectedName, bill) {
const target = canonicalizeName(detectedName);
const candidate = canonicalizeName(bill.name);
if (!target || !candidate) return null;
if (candidate === target) {
return { match_confidence: 'high', match_reason: 'exact_normalized_match', score: 100 };
}
const tTokens = nameTokens(target);
const bTokens = nameTokens(candidate);
if (!tTokens.length || !bTokens.length) return null;
const tSet = new Set(tTokens);
const bSet = new Set(bTokens);
const overlap = tTokens.filter((t) => bSet.has(t));
const overlapRatio = overlap.length / Math.max(tSet.size, bSet.size);
const sameTokens = overlap.length === tSet.size && overlap.length === bSet.size;
if (sameTokens) {
return { match_confidence: 'medium', match_reason: 'token_reorder_match', score: 86 };
}
if (candidate.includes(target) || target.includes(candidate)) {
const targetIsShortSubset = tTokens.length === 1 && bTokens.length > 1;
const extraTokens = bTokens.filter((t) => !tSet.has(t));
const isKnownStrongSubset = tTokens[0] === 'capital' && extraTokens.includes('one');
return {
match_confidence: targetIsShortSubset && !isKnownStrongSubset ? 'low' : 'medium',
match_reason: isKnownStrongSubset ? 'contains_match: Capital -> Capital One' : 'contains_match',
score: isKnownStrongSubset ? 82 : (targetIsShortSubset ? 45 : 72),
};
}
if (overlap.length > 0 && overlapRatio >= 0.5) {
return { match_confidence: 'medium', match_reason: 'token_overlap_match', score: Math.round(60 + overlapRatio * 20) };
}
if (overlap.length > 0) {
return { match_confidence: 'low', match_reason: 'weak_token_overlap', score: 35 };
}
return null;
}
function findBillMatches(detectedName, userBills) {
if (!detectedName || !userBills.length) return [];
const matches = [];
for (const bill of userBills) {
const scored = matchScore(detectedName, bill);
if (!scored) continue;
matches.push({
bill_id: bill.id,
bill_name: bill.name,
category_id: bill.category_id ?? null,
category: bill.category_name || null,
expected_amount: bill.expected_amount,
due_day: bill.due_day ?? null,
match_confidence: scored.match_confidence,
match_reason: scored.match_reason,
score: scored.score,
});
}
const ORDER = { high: 0, medium: 1, low: 2 };
matches.sort((a, b) => ORDER[a.match_confidence] - ORDER[b.match_confidence] || b.score - a.score);
return matches;
}
// ─── Recommendation Helpers ──────────────────────────────────────────────────
function normalizeCategoryName(name) {
return normalizeName(name).replace(/&/g, 'and');
}
function findCategoryByName(name, categories) {
if (!name) return null;
const target = normalizeCategoryName(name);
return categories.find((c) => normalizeCategoryName(c.name) === target) || null;
}
function inferCategory({ billName, detectedCategory, notesText, topMatch }, categories) {
if (!categories.length) return null;
const explicit = findCategoryByName(detectedCategory, categories);
if (explicit) return { category: explicit, reason: 'category_column_match' };
if (topMatch?.category_id && topMatch?.category) {
return {
category: { id: topMatch.category_id, name: topMatch.category },
reason: `matched_bill_category: ${topMatch.bill_name}`,
};
}
const text = canonicalizeName([billName, detectedCategory, notesText].filter(Boolean).join(' '));
for (const rule of CATEGORY_KEYWORDS) {
if (!rule.words.some((word) => text.includes(canonicalizeName(word)))) continue;
for (const catName of rule.categories) {
const category = findCategoryByName(catName, categories);
if (category) return { category, reason: `keyword_category_match: ${catName}` };
}
}
return null;
}
function isPaymentDateHeader(header) {
return /\b(?:paid|payment|pay)\b/i.test(String(header || ''));
}
function isDueDateHeader(header) {
return /\bdue\b/i.test(String(header || ''));
}
function amountsDiffer(a, b) {
if (a == null || b == null) return false;
return Math.abs(Number(a) - Number(b)) >= 0.01;
}
function buildRecommendation({
billName,
detectedAmount,
parsedDate,
dateHeader,
detectedCategory,
notesText,
possibleMatches,
categories,
warnings,
errors,
paymentDateIso,
}) {
const recWarnings = [...warnings];
const topMatch = possibleMatches[0] || null;
const highMatches = possibleMatches.filter((m) => m.match_confidence === 'high');
const mediumMatches = possibleMatches.filter((m) => m.match_confidence === 'medium');
const dateDay = parsedDate?.day;
const dueDay = Number.isInteger(dateDay) && dateDay >= 1 && dateDay <= 31 ? dateDay : null;
const paymentDate = isPaymentDateHeader(dateHeader);
if (dueDay && paymentDate && !isDueDateHeader(dateHeader)) {
recWarnings.push('Date appears to be a payment date, not a due date');
}
let action = 'create_new_bill';
let confidence = 'low';
let reason = 'no_bill_match';
let selectedMatch = null;
if (errors.includes('No bill name detected')) {
action = 'skip_row';
confidence = 'high';
reason = 'blank_or_summary_row';
} else if (highMatches.length === 1) {
action = 'match_existing_bill';
confidence = 'high';
reason = highMatches[0].match_reason;
selectedMatch = highMatches[0];
} else if (highMatches.length > 1) {
action = 'ambiguous';
confidence = 'low';
reason = 'multiple_exact_matches';
recWarnings.push('Multiple exact matches — choose one');
} else if (mediumMatches.length === 1) {
action = 'match_existing_bill';
confidence = paymentDate ? 'low' : 'medium';
reason = mediumMatches[0].match_reason;
selectedMatch = mediumMatches[0];
recWarnings.push('Suggested match should be reviewed before apply');
} else if (possibleMatches.length > 0) {
action = 'ambiguous';
confidence = 'low';
reason = possibleMatches.length > 1 ? 'multiple_possible_matches' : possibleMatches[0].match_reason;
recWarnings.push('Possible weak match found — choose a bill, create a new bill, or skip');
}
const categoryGuess = inferCategory({
billName,
detectedCategory,
notesText,
topMatch: selectedMatch || (possibleMatches.length === 1 ? possibleMatches[0] : null),
}, categories);
if (action === 'create_new_bill' && !categoryGuess) {
recWarnings.push('Category uncertain — no existing category matched clearly');
}
if (selectedMatch && dueDay && selectedMatch.due_day && dueDay !== selectedMatch.due_day) {
recWarnings.push('Spreadsheet due day differs from current bill due day');
}
if (selectedMatch && amountsDiffer(detectedAmount, selectedMatch.expected_amount)) {
recWarnings.push('Spreadsheet amount differs from current bill expected amount');
}
return {
action,
bill_id: selectedMatch?.bill_id ?? null,
bill_name: selectedMatch?.bill_name ?? (action === 'create_new_bill' ? billName : null),
category_id: categoryGuess?.category?.id ?? null,
category_name: categoryGuess?.category?.name ?? null,
due_day: dueDay,
expected_amount: action === 'create_new_bill' && detectedAmount != null ? detectedAmount : null,
actual_amount: detectedAmount,
payment_amount: paymentDateIso && detectedAmount != null ? detectedAmount : null,
payment_date: paymentDateIso,
confidence,
reason: categoryGuess?.reason && action === 'create_new_bill' ? `${reason}; ${categoryGuess.reason}` : reason,
warnings: [...new Set(recWarnings)],
};
}
// ─── Year/Month Resolution ────────────────────────────────────────────────────
/**
* Determine the final year/month for a row and where they came from.
* Priority: full row date > partial row date (sheet/default year) > sheet name > default > ambiguous.
* Never falls back to today's date.
* Exported for testing.
*/
function resolveYearMonth(parsedDate, sheetYear, sheetMonth, defaultYear, defaultMonth) {
// Full date with year in the row cell
if (parsedDate?.year && parsedDate?.month) {
return { year: parsedDate.year, month: parsedDate.month, source: 'row_date', warnings: [] };
}
// Partial date (month/day, no year) in row — supplement year from sheet or default
if (parsedDate?.month && !parsedDate?.year) {
const year = sheetYear ?? defaultYear ?? null;
const source = year ? (sheetYear ? 'sheet_name' : 'default') : null;
const w = year ? [] : ['Year unknown — row date has no year; provide default_year'];
return { year, month: parsedDate.month, source: source || 'ambiguous', warnings: w };
}
// No date in row — use sheet tab name values
if (sheetMonth) {
const year = sheetYear ?? defaultYear ?? null;
const hasYear = year !== null;
return {
year,
month: sheetMonth,
source: hasYear ? 'sheet_name' : 'ambiguous',
warnings: hasYear ? [] : ['Year unknown — sheet name has no year; provide default_year'],
};
}
// No month from row or sheet — last resort: request defaults
if (defaultMonth) {
const year = defaultYear ?? null;
return {
year,
month: defaultMonth,
source: 'default',
warnings: year ? [] : ['Year unknown — provide default_year'],
};
}
// Nothing available
return { year: null, month: null, source: 'ambiguous', warnings: ['No year or month could be determined'] };
}
// ─── Row Analysis Helpers ─────────────────────────────────────────────────────
function findFirstAmountCell(cells, skipIndices) {
for (let i = 0; i < cells.length; i++) {
if (skipIndices.has(i) || cells[i] == null) continue;
const v = parseAmount(cells[i]);
if (v !== null && v > 0) return cells[i];
}
return null;
}
function collectNotesCells(cells, headerMap, billName) {
const skipIndices = new Set(Object.values(headerMap));
const parts = [];
for (let i = 0; i < cells.length; i++) {
if (skipIndices.has(i) || cells[i] == null) continue;
const val = String(cells[i]).trim();
if (!val) continue;
if (parseAmount(val) !== null) continue;
if (billName && normalizeName(val) === normalizeName(billName)) continue;
const parsed = parseDate(val);
if (parsed && (parsed.year || parsed.month)) continue;
parts.push(val);
}
return parts.join(' ').trim() || null;
}
// ─── Single-Row Analyzer ──────────────────────────────────────────────────────
function analyzeRow(rowIndex, cells, headerMap, headerLabels, userBills, categories, sheetName, sheetYear, sheetMonth, defaultYear, defaultMonth, rowIdPrefix) {
const get = (field) => {
const idx = headerMap[field];
return idx !== undefined ? cells[idx] : undefined;
};
const rawBillName = get('bill_name') ?? cells[0];
const billName = rawBillName ? String(rawBillName).trim() || null : null;
const skipIndices = new Set(Object.values(headerMap));
const rawAmount = get('amount') ?? findFirstAmountCell(cells, skipIndices);
const detectedAmount = parseAmount(rawAmount);
const parsedDueDate = parseDate(get('due_date'));
const parsedGenericDate = parseDate(get('date'));
const parsedDate = parsedDueDate ?? parsedGenericDate;
const dateHeader = headerMap.due_date !== undefined
? headerLabels[headerMap.due_date]
: (headerMap.date !== undefined ? headerLabels[headerMap.date] : null);
const {
year: detectedYear,
month: detectedMonth,
source: yearMonthSource,
warnings: ymWarnings,
} = resolveYearMonth(parsedDate, sheetYear, sheetMonth, defaultYear, defaultMonth);
const detectedDate = parsedDate?.iso ?? resolveDateIso(parsedDate, sheetYear ?? defaultYear ?? null);
const parsedPaidDate = parseDate(get('paid_date'));
const paidDateYear = parsedPaidDate?.year ?? sheetYear ?? detectedYear ?? defaultYear ?? null;
const detectedPaidDate = resolveDateIso(parsedPaidDate, paidDateYear);
const rawCategory = get('category');
const detectedCategory = rawCategory ? String(rawCategory).trim() || null : null;
const notesText = collectNotesCells(cells, headerMap, billName);
const allText = cells.filter((c) => c != null && typeof c === 'string').map((c) => c.trim()).join(' ');
const detectedLabels = detectLabels(allText);
const rawValues = cells.map((c) => (c != null ? String(c) : null));
const warnings = [...ymWarnings];
const errors = [];
if (!billName) errors.push('No bill name detected');
if (detectedAmount === null) warnings.push('No amount detected');
const possibleMatches = billName ? findBillMatches(billName, userBills) : [];
const recommendation = buildRecommendation({
billName,
detectedAmount,
parsedDate,
dateHeader,
detectedCategory,
notesText,
possibleMatches,
categories,
warnings,
errors,
paymentDateIso: detectedPaidDate,
});
const proposedAction = recommendation.action === 'ambiguous' ? 'mark_ambiguous' : recommendation.action;
const confidence = recommendation.confidence;
const requiresUserDecision = recommendation.action === 'ambiguous';
const rowId = rowIdPrefix ? `${rowIdPrefix}_r${rowIndex + 1}` : `row_${rowIndex + 1}`;
return {
row_id: rowId,
source_row_number: rowIndex + 1,
sheet_name: sheetName,
year_month_source: yearMonthSource,
raw_values: rawValues,
detected_bill_name: billName,
detected_category: detectedCategory,
detected_amount: detectedAmount,
detected_date: detectedDate,
detected_paid_date: detectedPaidDate,
detected_payment_amount: detectedPaidDate && detectedAmount != null ? detectedAmount : null,
detected_year: detectedYear,
detected_month: detectedMonth,
detected_notes: notesText,
detected_labels: detectedLabels,
proposed_action: proposedAction,
confidence,
warnings: recommendation.warnings,
errors,
possible_bill_matches: possibleMatches,
requires_user_decision: requiresUserDecision,
recommendation,
};
}
// ─── Sheet Row Parser ─────────────────────────────────────────────────────────
/**
* Parse all data rows from one sheet's rawRows array.
* sheetYear/sheetMonth come from parseSheetName(); rowIdPrefix is null for single-sheet mode.
*/
function parseSheetRows({ name, rawRows, year: sheetYear, month: sheetMonth, rowIdPrefix }, userBills, categories, defaultYear, defaultMonth) {
if (!rawRows.length) return { rows: [], headerRow: null };
const firstRow = rawRows[0] || [];
const headerMap = detectHeaders(firstRow);
const headerLabels = firstRow.map((c) => (c != null ? String(c).trim() : null));
const hasHeaders = Object.keys(headerMap).length > 0;
const startRow = hasHeaders ? 1 : 0;
const rows = [];
for (let i = startRow; i < rawRows.length; i++) {
const cells = rawRows[i] || [];
if (isBlankRow(cells)) continue;
if (isLikelyHeaderRow(cells) && i > 0) continue;
if (isLikelyTotalRow(cells)) continue;
rows.push(analyzeRow(
i, cells, headerMap, headerLabels, userBills, categories,
name, sheetYear, sheetMonth,
defaultYear, defaultMonth, rowIdPrefix,
));
}
return {
rows,
headerRow: hasHeaders ? firstRow.map((c) => (c != null ? String(c) : null)) : null,
};
}
// ─── Import Session Management ────────────────────────────────────────────────
function makeSessionId() {
return crypto.randomBytes(16).toString('hex');
}
function saveImportSession(db, userId, sessionData) {
const id = makeSessionId();
const now = new Date().toISOString();
const expiresAt = new Date(Date.now() + SESSION_TTL_MS).toISOString();
db.prepare(`
INSERT INTO import_sessions (id, user_id, created_at, expires_at, preview_json)
VALUES (?, ?, ?, ?, ?)
`).run(id, userId, now, expiresAt, JSON.stringify(sessionData));
return id;
}
function loadImportSession(db, sessionId, userId) {
const row = db.prepare(`
SELECT preview_json FROM import_sessions
WHERE id = ? AND user_id = ? AND expires_at > datetime('now')
`).get(sessionId, userId);
if (!row) {
const err = new Error('Import session not found or expired. Please re-upload your file.');
err.status = 404;
throw err;
}
return JSON.parse(row.preview_json);
}
function deleteImportSession(db, sessionId) {
db.prepare('DELETE FROM import_sessions WHERE id = ?').run(sessionId);
}
function pruneExpiredSessions(db) {
db.prepare("DELETE FROM import_sessions WHERE expires_at <= datetime('now')").run();
}
// ─── Preview ──────────────────────────────────────────────────────────────────
async function previewSpreadsheet(userId, buffer, options = {}) {
const db = getDb();
pruneExpiredSessions(db);
ensureUserDefaultCategories(userId);
const workbook = parseXlsxBuffer(buffer);
const sheetNames = workbook.SheetNames;
const parseAll = options.parse_all_sheets === true;
const defaultYear = options.default_year ?? null;
const defaultMonth = options.default_month ?? null;
// Load user's active bills once for matching across all sheets
const userBills = db.prepare(`
SELECT b.id, b.name, b.category_id, b.due_day, b.expected_amount, b.autopay_enabled,
c.name AS category_name
FROM bills b
LEFT JOIN categories c ON c.id = b.category_id
WHERE b.active = 1 AND b.user_id = ?
ORDER BY b.name
`).all(userId);
const categories = db.prepare('SELECT id, name FROM categories WHERE user_id = ? ORDER BY name').all(userId);
// ── Multi-sheet mode ─────────────────────────────────────────────────────────
if (parseAll) {
const sheetsMeta = [];
let allRows = [];
let totalParsed = 0;
for (let si = 0; si < sheetNames.length; si++) {
const name = sheetNames[si];
const { year, month, is_non_month_sheet } = parseSheetName(name);
if (is_non_month_sheet) {
sheetsMeta.push({
name, detected_year: null, detected_month: null,
is_non_month_sheet: true, status: 'skipped', row_count: 0,
});
continue;
}
const rawRows = getSheetRows(workbook, name);
const rowIdPrefix = `s${si}`;
const { rows, headerRow } = parseSheetRows(
{ name, rawRows, year, month, rowIdPrefix },
userBills, categories, defaultYear, defaultMonth,
);
totalParsed += rows.length;
if (totalParsed > MAX_ROWS) {
const err = new Error(`Spreadsheet too large. Combined rows across all sheets exceed ${MAX_ROWS}.`);
err.status = 400;
throw err;
}
const resolvedYear = year ?? (month ? defaultYear : null);
let status;
if (!month) {
status = 'ambiguous';
} else if (resolvedYear) {
status = 'parsed';
} else {
status = 'parsed_month_only';
}
sheetsMeta.push({
name, detected_year: resolvedYear, detected_month: month,
is_non_month_sheet: false, status,
row_count: rows.length, header_row: headerRow,
});
allRows = allRows.concat(rows);
}
const sessionData = {
user_id: userId,
original_filename: options.original_filename || null,
sheet_name: 'multiple',
rows: allRows,
default_year: defaultYear,
default_month: defaultMonth,
created_at: new Date().toISOString(),
};
const importSessionId = saveImportSession(db, userId, sessionData);
return {
import_session_id: importSessionId,
workbook: {
sheet_names: sheetNames,
parse_mode: 'all_sheets',
sheets: sheetsMeta,
total_row_count: allRows.length,
},
rows: allRows,
};
}
// ── Single-sheet mode (existing behaviour) ────────────────────────────────
const selectedSheet =
options.sheet_name && sheetNames.includes(options.sheet_name)
? options.sheet_name
: sheetNames[0];
const rawRows = getSheetRows(workbook, selectedSheet);
if (rawRows.length === 0) {
const err = new Error('Selected sheet is empty.');
err.status = 400;
throw err;
}
if (rawRows.length > MAX_ROWS + 1) {
const err = new Error(`Spreadsheet too large. Maximum ${MAX_ROWS} data rows supported.`);
err.status = 400;
throw err;
}
const { year: sheetYear, month: sheetMonth } = parseSheetName(selectedSheet);
const { rows: previewRows, headerRow } = parseSheetRows(
{ name: selectedSheet, rawRows, year: sheetYear, month: sheetMonth, rowIdPrefix: null },
userBills, categories, defaultYear, defaultMonth,
);
const sessionData = {
user_id: userId,
original_filename: options.original_filename || null,
sheet_name: selectedSheet,
rows: previewRows,
default_year: defaultYear,
default_month: defaultMonth,
created_at: new Date().toISOString(),
};
const importSessionId = saveImportSession(db, userId, sessionData);
return {
import_session_id: importSessionId,
workbook: {
sheet_names: sheetNames,
selected_sheet: selectedSheet,
parse_mode: 'single_sheet',
row_count: previewRows.length,
header_row: headerRow,
},
rows: previewRows,
};
}
// ─── Apply ────────────────────────────────────────────────────────────────────
const VALID_ACTIONS = new Set([
'match_existing_bill',
'create_new_bill',
'skip_row',
'create_payment',
'update_monthly_state',
'add_monthly_note',
]);
function importValidationError(message, detail = {}) {
const err = new Error(message);
err.status = 400;
err.code = 'IMPORT_VALIDATION_ERROR';
err.details = [detail].filter(Boolean);
return err;
}
function decisionValidationError(idx, d, field, message) {
return importValidationError(`Decision [${idx}]: ${message}`, {
row_id: d?.row_id ?? null,
field,
message,
});
}
function coerceOptionalNumber(d, idx, field, { min = null, max = null } = {}) {
if (d[field] === undefined || d[field] === null || d[field] === '') return null;
const value = Number(d[field]);
if (!Number.isFinite(value)) {
throw decisionValidationError(idx, d, field, `${field} must be a finite number`);
}
if (min !== null && value < min) {
throw decisionValidationError(idx, d, field, `${field} must be at least ${min}`);
}
if (max !== null && value > max) {
throw decisionValidationError(idx, d, field, `${field} must be no greater than ${max}`);
}
d[field] = value;
return value;
}
function coerceOptionalInteger(d, idx, field, { min = null, max = null } = {}) {
if (d[field] === undefined || d[field] === null || d[field] === '') return null;
const value = Number(d[field]);
if (!Number.isInteger(value)) {
throw decisionValidationError(idx, d, field, `${field} must be an integer`);
}
if (min !== null && value < min) {
throw decisionValidationError(idx, d, field, `${field} must be at least ${min}`);
}
if (max !== null && value > max) {
throw decisionValidationError(idx, d, field, `${field} must be no greater than ${max}`);
}
d[field] = value;
return value;
}
function validateOptionalIsoDate(d, idx, field) {
if (d[field] === undefined || d[field] === null || d[field] === '') return null;
const value = String(d[field]).trim();
if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) {
throw decisionValidationError(idx, d, field, `${field} must use YYYY-MM-DD format`);
}
d[field] = value;
return value;
}
function validateDecision(d, idx) {
if (!d.row_id)
throw decisionValidationError(idx, d, 'row_id', 'row_id is required');
if (!VALID_ACTIONS.has(d.action))
throw decisionValidationError(idx, d, 'action', `unsupported action '${d.action}'`);
if (d.action === 'create_new_bill' && !d.bill_name)
throw decisionValidationError(idx, d, 'bill_name', 'bill_name is required for create_new_bill');
if (['match_existing_bill', 'update_monthly_state', 'add_monthly_note', 'create_payment'].includes(d.action)) {
if (d.bill_id === undefined || d.bill_id === null || d.bill_id === '')
throw decisionValidationError(idx, d, 'bill_id', `bill_id is required for ${d.action}`);
coerceOptionalInteger(d, idx, 'bill_id', { min: 1 });
}
coerceOptionalInteger(d, idx, 'due_day', { min: 1, max: 31 });
coerceOptionalInteger(d, idx, 'year', { min: 2000, max: 2100 });
coerceOptionalInteger(d, idx, 'month', { min: 1, max: 12 });
coerceOptionalInteger(d, idx, 'category_id', { min: 1 });
coerceOptionalNumber(d, idx, 'actual_amount', { min: 0 });
coerceOptionalNumber(d, idx, 'expected_amount', { min: 0 });
coerceOptionalNumber(d, idx, 'payment_amount', { min: 0 });
validateOptionalIsoDate(d, idx, 'payment_date');
if (d.action === 'create_payment' && !d.payment_amount && !d.actual_amount) {
throw decisionValidationError(idx, d, 'payment_amount', 'payment_amount or actual_amount is required for create_payment');
}
}
function resolveYear(decision, previewRow, sessionData) {
return decision.year ?? previewRow?.detected_year ?? sessionData.default_year ?? null;
}
function resolveMonth(decision, previewRow, sessionData) {
return decision.month ?? previewRow?.detected_month ?? sessionData.default_month ?? null;
}
function upsertMonthlyState(db, billId, year, month, amount, notes, isSkipped, allowOverwrite) {
const existing = db.prepare(`
SELECT id, actual_amount, notes, is_skipped
FROM monthly_bill_state
WHERE bill_id = ? AND year = ? AND month = ?
`).get(billId, year, month);
if (!existing) {
db.prepare(`
INSERT INTO monthly_bill_state (bill_id, year, month, actual_amount, notes, is_skipped)
VALUES (?, ?, ?, ?, ?, ?)
`).run(billId, year, month, amount, notes, isSkipped ?? 0);
return { result: 'created' };
}
const amountConflict = (amount !== null && existing.actual_amount !== null && existing.actual_amount !== amount);
const notesConflict = (notes !== null && existing.notes !== null && existing.notes !== notes);
if ((amountConflict || notesConflict) && !allowOverwrite) {
return { result: 'skipped_conflict', note: 'Monthly state already exists with different values — use overwrite:true to replace' };
}
const newAmount = allowOverwrite ? amount : (existing.actual_amount !== null ? existing.actual_amount : amount);
const newNotes = allowOverwrite ? notes : (existing.notes !== null ? existing.notes : notes);
const newSkipped = isSkipped !== null ? isSkipped : existing.is_skipped;
db.prepare(`
UPDATE monthly_bill_state
SET actual_amount = ?, notes = ?, is_skipped = ?, updated_at = datetime('now')
WHERE bill_id = ? AND year = ? AND month = ?
`).run(newAmount, newNotes, newSkipped, billId, year, month);
return { result: allowOverwrite ? 'overwritten' : 'updated' };
}
function createPaymentFromImport(db, billId, amount, paidDate, notes, allowOverwrite) {
if (!paidDate || amount == null || amount <= 0) return null;
const dup = db.prepare(`
SELECT id FROM payments
WHERE bill_id = ? AND paid_date = ? AND amount = ? AND deleted_at IS NULL
`).get(billId, paidDate, amount);
if (dup && !allowOverwrite) return 'skipped_duplicate';
db.prepare(`
INSERT INTO payments (bill_id, amount, paid_date, method, notes)
VALUES (?, ?, ?, ?, ?)
`).run(billId, amount, paidDate, null, notes);
return 'created';
}
function resolvePaymentInfo(decision, previewRow, amount) {
const paymentDate = decision.payment_date ?? previewRow?.detected_paid_date ?? null;
const paymentAmount = decision.payment_amount
?? previewRow?.detected_payment_amount
?? (paymentDate ? amount : null);
return { paymentDate, paymentAmount };
}
function importRelatedPaidMonthsForNewBill(db, newBillId, billName, sourceRowId, sessionData, appliedRowIds, allowOverwrite, summary) {
const targetName = normalizeName(billName);
if (!targetName) return 0;
let imported = 0;
for (const row of sessionData.rows || []) {
if (row.row_id === sourceRowId || appliedRowIds.has(row.row_id)) continue;
if (normalizeName(row.detected_bill_name || '') !== targetName) continue;
if (!row.detected_year || !row.detected_month) continue;
const amount = row.detected_amount ?? null;
const notes = row.detected_notes ?? null;
const paymentDate = row.detected_paid_date ?? null;
const paymentAmount = row.detected_payment_amount ?? (paymentDate ? amount : null);
if (amount == null && !paymentDate) continue;
const detail = { row_id: row.row_id, action: 'create_new_bill_related_month', result: 'imported', bill_id: newBillId };
if (amount != null) {
const st = upsertMonthlyState(db, newBillId, row.detected_year, row.detected_month, amount, notes, null, allowOverwrite);
detail.monthly_state = st.result;
}
const paymentResult = createPaymentFromImport(db, newBillId, paymentAmount, paymentDate, notes, allowOverwrite);
if (paymentResult) {
detail.payment = paymentResult;
detail.paid_date = paymentDate;
detail.payment_amount = paymentAmount;
}
summary.created++;
if (summary.skipped > 0) summary.skipped--;
summary.details.push(detail);
imported++;
}
return imported;
}
function applyOneDecision(db, userId, decision, previewRow, sessionData, allowOverwrite, summary, appliedRowIds) {
const { row_id, action } = decision;
if (action === 'skip_row') {
summary.skipped++;
summary.details.push({ row_id, action, result: 'skipped' });
return;
}
try {
const year = resolveYear(decision, previewRow, sessionData);
const month = resolveMonth(decision, previewRow, sessionData);
const amount = decision.actual_amount ?? previewRow?.detected_amount ?? null;
const notes = decision.notes ?? previewRow?.detected_notes ?? null;
if (action === 'create_new_bill') {
const name = (decision.bill_name || previewRow?.detected_bill_name || '').trim();
if (!name) throw new Error('Bill name required');
const existing = db.prepare(
"SELECT id FROM bills WHERE name = ? COLLATE NOCASE AND active = 1 AND user_id = ?",
).get(name, userId);
if (existing && !allowOverwrite) {
summary.skipped++;
summary.details.push({ row_id, action, result: 'skipped_duplicate', note: `Bill "${name}" already exists (id=${existing.id})` });
return;
}
let categoryId = decision.category_id ?? null;
if (categoryId) {
const cat = db.prepare('SELECT id FROM categories WHERE id = ? AND user_id = ?').get(categoryId, userId);
categoryId = cat?.id ?? null;
}
const catName = categoryId ? null : (decision.category || previewRow?.detected_category);
if (!categoryId && catName) {
const cat = db.prepare('SELECT id FROM categories WHERE name = ? COLLATE NOCASE AND user_id = ?').get(catName, userId);
categoryId = cat?.id ?? null;
}
const dueDay = decision.due_day ?? 1;
const expectedAmount = decision.expected_amount ?? amount ?? 0;
const ins = db.prepare(`
INSERT INTO bills (user_id, name, category_id, due_day, bucket, expected_amount, billing_cycle, active)
VALUES (?, ?, ?, ?, ?, ?, 'monthly', 1)
`).run(userId, name, categoryId, dueDay, dueDay <= 14 ? '1st' : '15th', expectedAmount);
const newBillId = ins.lastInsertRowid;
summary.created++;
const detail = { row_id, action, result: 'created', bill_id: newBillId, bill_name: name };
if (year && month) {
const st = upsertMonthlyState(db, newBillId, year, month, amount, notes, null, allowOverwrite);
detail.monthly_state = st.result;
}
const { paymentDate, paymentAmount } = resolvePaymentInfo(decision, previewRow, amount);
const paymentResult = createPaymentFromImport(
db,
newBillId,
paymentAmount,
paymentDate,
decision.payment_notes ?? notes,
allowOverwrite,
);
if (paymentResult) {
detail.payment = paymentResult;
detail.paid_date = paymentDate;
detail.payment_amount = paymentAmount;
}
const relatedImported = importRelatedPaidMonthsForNewBill(
db,
newBillId,
name,
row_id,
sessionData,
appliedRowIds,
allowOverwrite,
summary,
);
if (relatedImported > 0) detail.related_months_imported = relatedImported;
summary.details.push(detail);
} else if (['match_existing_bill', 'update_monthly_state', 'add_monthly_note'].includes(action)) {
const billId = decision.bill_id;
const bill = db.prepare('SELECT id, name FROM bills WHERE id = ? AND active = 1 AND user_id = ?').get(billId, userId);
if (!bill) throw new Error(`Bill id=${billId} not found or inactive`);
if (!year || !month) {
summary.ambiguous++;
summary.details.push({ row_id, action, result: 'ambiguous', error: 'year and month required for monthly state' });
return;
}
const noteToStore = action === 'add_monthly_note' ? (decision.notes ?? previewRow?.detected_notes ?? null) : notes;
const amountToStore = action === 'add_monthly_note' ? null : amount;
const isSkipped = decision.is_skipped ?? null;
const st = upsertMonthlyState(db, billId, year, month, amountToStore, noteToStore, isSkipped, allowOverwrite);
if (st.result === 'skipped_conflict') { summary.skipped++; }
else if (st.result === 'created') { summary.created++; }
else { summary.updated++; }
const detail = { row_id, action, result: st.result, bill_id: billId };
if (st.note) detail.note = st.note;
const { paymentDate, paymentAmount } = resolvePaymentInfo(decision, previewRow, amount);
if (paymentDate && paymentAmount != null && paymentAmount > 0) {
const paymentResult = createPaymentFromImport(
db,
billId,
paymentAmount,
paymentDate,
decision.payment_notes ?? noteToStore,
allowOverwrite,
);
if (paymentResult) {
detail.payment = paymentResult;
detail.paid_date = paymentDate;
detail.payment_amount = paymentAmount;
}
}
summary.details.push(detail);
} else if (action === 'create_payment') {
const billId = decision.bill_id;
const bill = db.prepare('SELECT id FROM bills WHERE id = ? AND active = 1 AND user_id = ?').get(billId, userId);
if (!bill) throw new Error(`Bill id=${billId} not found or inactive`);
const payAmount = decision.payment_amount ?? amount;
const payDate = decision.payment_date
?? (year && month ? `${year}-${String(month).padStart(2, '0')}-01` : null);
if (!payAmount || !payDate) {
summary.ambiguous++;
summary.details.push({ row_id, action, result: 'ambiguous', error: 'payment_amount and payment_date required' });
return;
}
const dup = db.prepare(`
SELECT id FROM payments
WHERE bill_id = ? AND paid_date = ? AND amount = ? AND deleted_at IS NULL
`).get(billId, payDate, payAmount);
if (dup && !allowOverwrite) {
summary.skipped++;
summary.details.push({ row_id, action, result: 'skipped_duplicate', note: 'Identical payment already exists' });
return;
}
db.prepare(`
INSERT INTO payments (bill_id, amount, paid_date, method, notes)
VALUES (?, ?, ?, ?, ?)
`).run(billId, payAmount, payDate, decision.payment_method ?? null, decision.payment_notes ?? null);
summary.created++;
summary.details.push({ row_id, action, result: 'created', bill_id: billId, paid_date: payDate, amount: payAmount });
}
} catch (err) {
summary.errored++;
summary.details.push({ row_id, action, result: 'error', error: err.message });
}
}
async function applyImportDecisions(userId, importSessionId, decisions, opts = {}) {
const db = getDb();
ensureUserDefaultCategories(userId);
const sessionData = loadImportSession(db, importSessionId, userId);
const rowLookup = Object.fromEntries(sessionData.rows.map((r) => [r.row_id, r]));
const appliedRowIds = new Set(decisions.map((d) => d.row_id));
for (let i = 0; i < decisions.length; i++) {
validateDecision(decisions[i], i);
if (!rowLookup[decisions[i].row_id]) {
throw decisionValidationError(i, decisions[i], 'row_id', `row_id '${decisions[i].row_id}' was not found in this import session`);
}
}
const allowOverwrite = opts.overwrite === true;
const reviewedSkippedCount = Number.isInteger(Number(opts.reviewed_skipped_count))
? Math.max(0, Number(opts.reviewed_skipped_count))
: 0;
const summary = { created: 0, updated: 0, skipped: reviewedSkippedCount, errored: 0, ambiguous: 0, details: [] };
const applyAll = db.transaction(() => {
for (const decision of decisions) {
applyOneDecision(db, userId, decision, rowLookup[decision.row_id], sessionData, allowOverwrite, summary, appliedRowIds);
}
});
applyAll();
try {
db.prepare(`
INSERT INTO import_history (
user_id, imported_at, source_filename, file_type, sheet_name,
rows_parsed, rows_created, rows_updated, rows_skipped, rows_ambiguous,
rows_errored, options_json, summary_json
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`).run(
userId,
new Date().toISOString(),
sessionData.original_filename,
'xlsx',
sessionData.sheet_name,
sessionData.rows.length,
summary.created,
summary.updated,
summary.skipped,
summary.ambiguous,
summary.errored,
JSON.stringify(opts),
JSON.stringify(summary.details),
);
} catch (histErr) {
console.error('[import] history write failed:', histErr.message);
}
deleteImportSession(db, importSessionId);
return {
success: true,
rows_parsed: sessionData.rows.length,
rows_created: summary.created,
rows_updated: summary.updated,
rows_skipped: summary.skipped,
rows_ambiguous: summary.ambiguous,
rows_errored: summary.errored,
details: summary.details,
};
}
// ─── Import History ───────────────────────────────────────────────────────────
function getImportHistory(userId) {
const db = getDb();
return db.prepare(`
SELECT id, imported_at, source_filename, file_type, sheet_name,
rows_parsed, rows_created, rows_updated, rows_skipped,
rows_ambiguous, rows_errored
FROM import_history
WHERE user_id = ?
ORDER BY imported_at DESC
LIMIT 100
`).all(userId);
}
module.exports = {
previewSpreadsheet,
applyImportDecisions,
getImportHistory,
// Exported for testing
parseSheetName,
resolveYearMonth,
parseAmount,
parseDate,
normalizeName,
findBillMatches,
buildRecommendation,
};