fix: use Unicode-aware SQLite LIKE filtering (#2903)

* fix: use Unicode-aware SQLite filtering

* Add release notes, fix type check

* Fix code styling
This commit is contained in:
dymanoid
2024-07-04 17:24:58 +02:00
committed by GitHub
parent d008944022
commit 710d9ab8ac
7 changed files with 128 additions and 4 deletions

View File

@@ -4,6 +4,8 @@ import { v4 as uuidv4 } from 'uuid';
import { removeFile, readFile } from '../fs';
import { unicodeLike } from './unicodeLike';
function verifyParamTypes(sql, arr) {
arr.forEach(val => {
if (typeof val !== 'string' && typeof val !== 'number' && val !== null) {
@@ -101,7 +103,7 @@ function regexp(regex: string, text: string | null) {
export function openDatabase(pathOrBuffer: string | Buffer) {
const db = new SQL(pathOrBuffer);
// Define Unicode-aware LOWER and UPPER implementation.
// Define Unicode-aware LOWER, UPPER, and LIKE implementation.
// This is necessary because better-sqlite3 uses SQLite build without ICU support.
// @ts-expect-error @types/better-sqlite3 does not support setting strict 3rd argument
db.function('UNICODE_LOWER', { deterministic: true }, (arg: string | null) =>
@@ -112,6 +114,8 @@ export function openDatabase(pathOrBuffer: string | Buffer) {
arg?.toUpperCase(),
);
// @ts-expect-error @types/better-sqlite3 does not support setting strict 3rd argument
db.function('UNICODE_LIKE', { deterministic: true }, unicodeLike);
// @ts-expect-error @types/better-sqlite3 does not support setting strict 3rd argument
db.function('REGEXP', { deterministic: true }, regexp);
return db;
}

View File

@@ -1,6 +1,8 @@
// @ts-strict-ignore
import initSqlJS, { type SqlJsStatic, type Database } from '@jlongster/sql.js';
import { unicodeLike } from './unicodeLike';
let SQL: SqlJsStatic | null = null;
export async function init() {
@@ -193,7 +195,7 @@ export async function openDatabase(pathOrBuffer?: string | Buffer) {
db = new SQL.Database();
}
// Define Unicode-aware LOWER and UPPER implementation.
// Define Unicode-aware LOWER, UPPER, and LIKE implementation.
// This is necessary because sql.js uses SQLite build without ICU support.
//
// Note that this function should ideally be created with a deterministic flag
@@ -201,6 +203,7 @@ export async function openDatabase(pathOrBuffer?: string | Buffer) {
// but SQL.js does not support this: https://github.com/sql-js/sql.js/issues/551
db.create_function('UNICODE_LOWER', arg => arg?.toLowerCase());
db.create_function('UNICODE_UPPER', arg => arg?.toUpperCase());
db.create_function('UNICODE_LIKE', unicodeLike);
db.create_function('REGEXP', regexp);
return db;
}

View File

@@ -0,0 +1,57 @@
import { unicodeLike } from './unicodeLike';
// Unit tests for `unicodeLike(pattern, value)`, which reports a match as 1
// and a non-match as 0.
describe('unicode LIKE functionality', () => {
  const MATCH = 1;
  const NO_MATCH = 0;

  // A string made only of regular-expression metacharacters; it must be
  // compared literally when used as a LIKE pattern.
  const regexMetaCharacters = '.*+^${}()|[]\\';

  it('empty pattern should not match to a value', () => {
    expect(unicodeLike(null, 'value')).toBe(NO_MATCH);
  });

  it('empty pattern should not match to null', () => {
    expect(unicodeLike(null, null)).toBe(NO_MATCH);
  });

  it('should match special characters', () => {
    expect(unicodeLike(regexMetaCharacters, regexMetaCharacters)).toBe(MATCH);
  });

  it('should use ? as the single character placeholder', () => {
    expect(unicodeLike('t?st', 'test')).toBe(MATCH);
  });

  it('should use % as the zero-or-more characters placeholder', () => {
    expect(unicodeLike('t%st', 'te123st')).toBe(MATCH);
  });

  it('should ignore case for unicode', () => {
    expect(unicodeLike('á', 'Ábcdefg')).toBe(MATCH);
  });

  it('should ignore case for ascii', () => {
    expect(unicodeLike('a', 'Abcdefg')).toBe(MATCH);
  });

  it('should treat null value as empty string', () => {
    expect(unicodeLike('%', null)).toBe(MATCH);
  });

  it('should not match null value to the string “null”', () => {
    expect(unicodeLike('null', null)).toBe(NO_MATCH);
  });
});

View File

@@ -0,0 +1,32 @@
// @ts-strict-ignore
import LRU from 'lru-cache';
// Cache of compiled regular expressions, keyed by the raw LIKE pattern string,
// so that a pattern is only translated and compiled once while it stays cached.
// The cache is capped at 500 entries.
const likePatternCache = new LRU({ max: 500 });
/**
 * Unicode-aware, case-insensitive pattern match intended to back the
 * `UNICODE_LIKE(pattern, value)` SQL function.
 *
 * Wildcards: `?` matches exactly one character and `%` matches zero or more
 * characters; every other character is matched literally. The pattern is not
 * anchored, so it matches when it occurs anywhere inside `value`.
 *
 * @param pattern - The LIKE pattern. `null` or an empty string never matches.
 * @param value - The text to test. `null` is treated as an empty string.
 * @returns `1` when `value` matches `pattern`, otherwise `0`.
 */
export function unicodeLike(
  pattern: string | null,
  value: string | null,
): number {
  if (!pattern) {
    return 0;
  }

  let cachedRegExp = likePatternCache.get(pattern);
  if (!cachedRegExp) {
    // we don't escape ? and % because we don't know
    // whether they originate from the user input or from our query compiler.
    // Maybe improve the query compiler to correctly process these characters?
    const processedPattern = pattern
      // Escape regular-expression metacharacters so they match literally.
      .replace(/[.*+^${}()|[\]\\]/g, '\\$&')
      .replaceAll('?', '.')
      .replaceAll('%', '.*');

    // Flags:
    //   i - case-insensitive comparison
    //   s - let the wildcards match line breaks too, so multi-line values
    //       (for example "a\nb" against "a%b") are matched correctly
    //   u - treat the input as Unicode code points, so a single `?` matches
    //       one full character even when it is encoded as a surrogate pair
    cachedRegExp = new RegExp(processedPattern, 'isu');
    likePatternCache.set(pattern, cachedRegExp);
  }

  return cachedRegExp.test(value ?? '') ? 1 : 0;
}

View File

@@ -123,6 +123,28 @@ describe('sheet language', () => {
);
});
// A `$like` filter must compile to a call to the custom UNICODE_LIKE SQL
// function, with the pattern as the first argument and the column as the second.
it('`like` should use unicode function', () => {
  const serializedQuery = q('transactions')
    .select('payee')
    .filter({ 'payee.name': { $like: `%TEST%` } })
    .serialize();

  const { sql } = generateSQLWithState(serializedQuery, schemaWithRefs);

  expect(sql).toMatch(`UNICODE_LIKE('%TEST%', payees1.name)`);
});
// A `$notlike` filter must compile to a negated call to the custom
// UNICODE_LIKE SQL function, with the pattern first and the column second.
it('`notlike` should use unicode function', () => {
  const serializedQuery = q('transactions')
    .select('payee')
    .filter({ 'payee.name': { $notlike: `%TEST%` } })
    .serialize();

  const { sql } = generateSQLWithState(serializedQuery, schemaWithRefs);

  expect(sql).toMatch(`NOT UNICODE_LIKE('%TEST%', payees1.name)`);
});
it('`select` allows nested functions', () => {
const result = generateSQLWithState(
q('transactions')

View File

@@ -720,7 +720,7 @@ const compileOp = saveStack('op', (state, fieldRef, opData) => {
}
case '$like': {
const [left, right] = valArray(state, [lhs, rhs], ['string', 'string']);
return `${left} LIKE ${right}`;
return `UNICODE_LIKE(${right}, ${left})`;
}
case '$regexp': {
const [left, right] = valArray(state, [lhs, rhs], ['string', 'string']);
@@ -728,7 +728,7 @@ const compileOp = saveStack('op', (state, fieldRef, opData) => {
}
case '$notlike': {
const [left, right] = valArray(state, [lhs, rhs], ['string', 'string']);
return `(${left} NOT LIKE ${right}\n OR ${left} IS NULL)`;
return `(NOT UNICODE_LIKE(${right}, ${left})\n OR ${left} IS NULL)`;
}
default:
throw new CompileError(`Unknown operator: ${op}`);

View File

@@ -0,0 +1,6 @@
---
category: Bugfix
authors: [dymanoid]
---
Use Unicode-aware database queries for filtering and searching.