Files
illusory-mapp/packages/logic/core/string.utils/sequence.matcher.ts
2026-02-28 14:50:04 +02:00

556 lines
12 KiB
TypeScript

/**
* Similar to Python's difflib.SequenceMatcher
*
* A flexible class for comparing pairs of sequences of any type.
* Uses the Ratcliff-Obershelp algorithm with "gestalt pattern matching"
* to find the longest contiguous matching subsequences.
*/
/**
* One matching block: `size` consecutive elements starting at index `a` of
* the first sequence that correspond to the elements starting at index `b`
* of the second sequence. A block with `size` 0 at the sequence lengths is
* appended by `getMatchingBlocks()` as an end-of-list sentinel.
*/
export interface Match {
/** Starting position in sequence a */
a: number;
/** Starting position in sequence b */
b: number;
/** Length of the matching block */
size: number;
}
/**
* Kind of edit step produced by `getOpcodes()`:
* - "replace": a non-empty range of a is swapped for a non-empty range of b
* - "delete": a range of a has no counterpart in b
* - "insert": a range of b has no counterpart in a
* - "equal": the ranges form a matching block
*/
export type OpCode = "replace" | "delete" | "insert" | "equal";
/**
* A single edit step relating the half-open range [i1, i2) of sequence a to
* the half-open range [j1, j2) of sequence b. For "delete" the b range is
* empty (j1 === j2); for "insert" the a range is empty (i1 === i2).
*/
export interface OpCodeTuple {
/** Operation type */
tag: OpCode;
/** Start index in sequence a */
i1: number;
/** End index in sequence a (exclusive) */
i2: number;
/** Start index in sequence b */
j1: number;
/** End index in sequence b (exclusive) */
j2: number;
}
/**
* Predicate deciding whether an element of sequence b is "junk". Junk
* elements are left out of the position index used to seed matches and are
* only absorbed at the edges of a match that was already found.
*/
export type JunkFunction<T> = (element: T) => boolean;
/**
 * Compares two sequences of arbitrary elements, modelled on Python's
 * `difflib.SequenceMatcher`.
 *
 * Elements are looked up through a `Map` and compared with `===`, so
 * primitives match by value and objects match by identity.
 *
 * Both input arrays are copied, so mutating them afterwards does not affect
 * the matcher. Results of `getMatchingBlocks()` and `getOpcodes()` are cached
 * until one of the sequences is replaced; the cached arrays are returned by
 * reference and should not be mutated by callers.
 */
export class SequenceMatcher<T> {
  private isjunk: JunkFunction<T> | null;
  private a: T[];
  private b: T[];
  private autojunk: boolean;

  // Lookup structures derived from sequence b (rebuilt by chainB()).
  /** Distinct elements of b for which `isjunk` returned true. */
  private bjunk: Set<T>;
  /** Distinct non-junk elements of b dropped by the autojunk heuristic. */
  private bpopular: Set<T>;
  /** Ascending positions in b of every element that is neither junk nor popular. */
  private b2j: Map<T, number[]>;

  // Cached results; reset to null whenever the relevant sequence changes.
  private fullbcount: Map<T, number> | null = null;
  private matchingBlocks: Match[] | null = null;
  private opcodes: OpCodeTuple[] | null = null;

  /**
   * @param isjunk   Optional predicate marking elements of b as junk.
   * @param a        First sequence (copied).
   * @param b        Second sequence (copied and indexed).
   * @param autojunk When true and b has at least 200 elements, elements that
   *                 make up more than roughly 1% of b are not used to seed
   *                 matches.
   */
  constructor(
    isjunk: JunkFunction<T> | null = null,
    a: T[] = [],
    b: T[] = [],
    autojunk: boolean = true,
  ) {
    this.isjunk = isjunk;
    this.a = [];
    this.b = [];
    this.autojunk = autojunk;
    this.bjunk = new Set();
    this.bpopular = new Set();
    this.b2j = new Map();
    this.setSeqs(a, b);
  }

  /**
   * Set both sequences to be compared
   */
  setSeqs(a: T[], b: T[]): void {
    this.setSeq1(a);
    this.setSeq2(b);
  }

  /**
   * Set the first sequence to be compared. The index built for sequence b is
   * kept, so comparing many `a` values against one `b` is cheap.
   */
  setSeq1(a: T[]): void {
    if (a === this.a) return;
    this.a = [...a];
    this.matchingBlocks = null;
    this.opcodes = null;
  }

  /**
   * Set the second sequence to be compared and rebuild its lookup structures.
   */
  setSeq2(b: T[]): void {
    if (b === this.b) return;
    this.b = [...b];
    this.matchingBlocks = null;
    this.opcodes = null;
    this.fullbcount = null;
    this.chainB();
  }

  /**
   * Analyze sequence b: classify its distinct elements as junk or popular and
   * index the positions of all remaining elements in `b2j`.
   */
  private chainB(): void {
    const b = this.b;
    const bjunk = new Set<T>();
    const bpopular = new Set<T>();
    const b2j = new Map<T, number[]>();

    // Count occurrences of each distinct element.
    const counts = new Map<T, number>();
    for (const element of b) {
      counts.set(element, (counts.get(element) ?? 0) + 1);
    }

    // An element is "popular" when it occurs more than floor(n / 100) + 1
    // times; the heuristic only applies to sequences of 200+ elements.
    const n = b.length;
    const popularThreshold = Math.floor(n / 100) + 1;
    const prunePopular = this.autojunk && n >= 200;
    for (const [element, count] of counts) {
      if (this.isjunk !== null && this.isjunk(element)) {
        bjunk.add(element);
      } else if (prunePopular && count > popularThreshold) {
        bpopular.add(element);
      }
    }

    // Index positions of everything that may seed a match. Positions are
    // appended in ascending order, which findLongestMatch relies on.
    b.forEach((element, index) => {
      if (bjunk.has(element) || bpopular.has(element)) return;
      const positions = b2j.get(element);
      if (positions === undefined) {
        b2j.set(element, [index]);
      } else {
        positions.push(index);
      }
    });

    this.bjunk = bjunk;
    this.bpopular = bpopular;
    this.b2j = b2j;
  }

  /**
   * Find the longest matching block in a[alo:ahi] and b[blo:bhi].
   *
   * The block is first located using only indexed (non-junk, non-popular)
   * elements, then widened on both sides with equal non-junk elements, and
   * finally widened with equal junk elements.
   *
   * @param alo Inclusive lower bound in a (default 0).
   * @param ahi Exclusive upper bound in a (default: length of a).
   * @param blo Inclusive lower bound in b (default 0).
   * @param bhi Exclusive upper bound in b (default: length of b).
   * @returns The earliest-starting longest match; `size` is 0 and the
   *          positions are `alo`/`blo` when nothing matches.
   */
  findLongestMatch(
    alo: number = 0,
    ahi: number | null = null,
    blo: number = 0,
    bhi: number | null = null,
  ): Match {
    const { a, b, b2j } = this;
    const aEnd = ahi ?? a.length;
    const bEnd = bhi ?? b.length;

    let besti = alo;
    let bestj = blo;
    let bestsize = 0;

    // j2len.get(j) is the length of the longest match ending at a[i - 1] and
    // b[j]; it is rebuilt for every row i from the previous row.
    let j2len = new Map<number, number>();
    for (let i = alo; i < aEnd; i++) {
      const newj2len = new Map<number, number>();
      const positions = b2j.get(a[i]);
      if (positions !== undefined) {
        for (const j of positions) {
          if (j < blo) continue;
          if (j >= bEnd) break; // positions are ascending
          const k = (j2len.get(j - 1) ?? 0) + 1;
          newj2len.set(j, k);
          if (k > bestsize) {
            besti = i - k + 1;
            bestj = j - k + 1;
            bestsize = k;
          }
        }
      }
      j2len = newj2len;
    }

    // Widen with equal NON-junk elements. This picks up popular elements,
    // which are not indexed in b2j and therefore could not seed the match.
    while (
      besti > alo &&
      bestj > blo &&
      !this.isBJunk(b[bestj - 1]) &&
      this.elementsEqual(a[besti - 1], b[bestj - 1])
    ) {
      besti--;
      bestj--;
      bestsize++;
    }
    while (
      besti + bestsize < aEnd &&
      bestj + bestsize < bEnd &&
      !this.isBJunk(b[bestj + bestsize]) &&
      this.elementsEqual(a[besti + bestsize], b[bestj + bestsize])
    ) {
      bestsize++;
    }

    // Widen with junk elements, but only where the two sequences actually
    // agree; without the equality check unrelated elements of a would be
    // reported as matching junk in b.
    while (
      besti > alo &&
      bestj > blo &&
      this.isBJunk(b[bestj - 1]) &&
      this.elementsEqual(a[besti - 1], b[bestj - 1])
    ) {
      besti--;
      bestj--;
      bestsize++;
    }
    while (
      besti + bestsize < aEnd &&
      bestj + bestsize < bEnd &&
      this.isBJunk(b[bestj + bestsize]) &&
      this.elementsEqual(a[besti + bestsize], b[bestj + bestsize])
    ) {
      bestsize++;
    }

    return { a: besti, b: bestj, size: bestsize };
  }

  /**
   * Return the list of matching blocks in increasing order of position.
   *
   * Adjacent blocks are merged, so consecutive entries never touch in both
   * sequences at once. The final entry is always the sentinel
   * `{ a: a.length, b: b.length, size: 0 }`.
   */
  getMatchingBlocks(): Match[] {
    if (this.matchingBlocks !== null) {
      return this.matchingBlocks;
    }

    const blocks: Match[] = [];
    // Junk handling can leave two blocks that touch each other (the widening
    // in findLongestMatch stops after the junk pass); fold them together.
    let run: Match = { a: 0, b: 0, size: 0 };
    for (const block of this.collectMatchingBlocks()) {
      if (run.a + run.size === block.a && run.b + run.size === block.b) {
        run.size += block.size;
      } else {
        if (run.size > 0) blocks.push(run);
        run = { ...block };
      }
    }
    if (run.size > 0) blocks.push(run);

    // Add sentinel
    blocks.push({ a: this.a.length, b: this.b.length, size: 0 });
    this.matchingBlocks = blocks;
    return blocks;
  }

  /**
   * Find every matching block by repeatedly taking the longest match of a
   * region and then processing the regions to its left and right.
   *
   * Uses an explicit work stack rather than recursion so that inputs with
   * many small blocks cannot exhaust the call stack.
   */
  private collectMatchingBlocks(): Match[] {
    const found: Match[] = [];
    const pending: Array<[number, number, number, number]> = [
      [0, this.a.length, 0, this.b.length],
    ];
    for (
      let region = pending.pop();
      region !== undefined;
      region = pending.pop()
    ) {
      const [alo, ahi, blo, bhi] = region;
      const match = this.findLongestMatch(alo, ahi, blo, bhi);
      if (match.size === 0) continue;
      found.push(match);
      if (alo < match.a && blo < match.b) {
        pending.push([alo, match.a, blo, match.b]);
      }
      const aNext = match.a + match.size;
      const bNext = match.b + match.size;
      if (aNext < ahi && bNext < bhi) {
        pending.push([aNext, ahi, bNext, bhi]);
      }
    }
    // Blocks never overlap and increase together in a and b, so ordering by
    // position in a restores left-to-right order.
    found.sort((x, y) => x.a - y.a || x.b - y.b);
    return found;
  }

  /**
   * Return the edit steps that turn a into b, covering both sequences from
   * start to end without gaps.
   */
  getOpcodes(): OpCodeTuple[] {
    if (this.opcodes !== null) {
      return this.opcodes;
    }
    const opcodes: OpCodeTuple[] = [];
    // (i, j) is the first position in a / b not yet covered by an opcode.
    let i = 0;
    let j = 0;
    for (const match of this.getMatchingBlocks()) {
      const aGap = i < match.a;
      const bGap = j < match.b;
      if (aGap || bGap) {
        const tag: OpCode = aGap && bGap ? "replace" : aGap ? "delete" : "insert";
        opcodes.push({ tag, i1: i, i2: match.a, j1: j, j2: match.b });
      }
      i = match.a + match.size;
      j = match.b + match.size;
      // The sentinel has size 0 and contributes no "equal" step.
      if (match.size > 0) {
        opcodes.push({ tag: "equal", i1: match.a, i2: i, j1: match.b, j2: j });
      }
    }
    this.opcodes = opcodes;
    return opcodes;
  }

  /**
   * Return a measure of the sequences' similarity in [0, 1]:
   * 2 * (matched elements) / (total elements). Two empty sequences score 1.
   */
  ratio(): number {
    let matched = 0;
    for (const block of this.getMatchingBlocks()) {
      matched += block.size; // the sentinel adds 0
    }
    return this.calculateRatio(matched);
  }

  /**
   * Return an upper bound on ratio() relatively quickly, by counting how many
   * elements the two sequences share regardless of order.
   */
  quickRatio(): number {
    if (this.fullbcount === null) {
      const counts = new Map<T, number>();
      for (const element of this.b) {
        counts.set(element, (counts.get(element) ?? 0) + 1);
      }
      this.fullbcount = counts;
    }
    // Work on a copy so the cached counts stay intact for later calls.
    const remaining = new Map(this.fullbcount);
    let matched = 0;
    for (const element of this.a) {
      const left = remaining.get(element) ?? 0;
      if (left > 0) {
        matched++;
        remaining.set(element, left - 1);
      }
    }
    return this.calculateRatio(matched);
  }

  /**
   * Return an upper bound on ratio() very quickly, using only the lengths.
   */
  realQuickRatio(): number {
    return this.calculateRatio(Math.min(this.a.length, this.b.length));
  }

  /**
   * Turn a count of matched elements into a similarity score.
   */
  private calculateRatio(matched: number): number {
    const total = this.a.length + this.b.length;
    return total === 0 ? 1.0 : (2.0 * matched) / total;
  }

  /**
   * Check if element is junk in sequence b
   */
  private isBJunk(element: T): boolean {
    return this.bjunk.has(element);
  }

  /**
   * Check if two elements are equal
   */
  private elementsEqual(a: T, b: T): boolean {
    return a === b;
  }
}
/**
 * Return the best "good enough" matches for `word` among `possibilities`,
 * similar to Python's `difflib.get_close_matches`.
 *
 * Each candidate is scored with `SequenceMatcher.ratio()`, using `word` as
 * the first sequence and the candidate as the second.
 *
 * @param word          Sequence to look for.
 * @param possibilities Candidate sequences.
 * @param n             Maximum number of matches to return (default 3).
 * @param cutoff        Minimum ratio a candidate needs to be returned
 *                      (default 0.6).
 * @returns Up to `n` candidates, best score first; candidates with equal
 *          scores keep their input order.
 * @throws Error if `n` is not greater than 0.
 */
export function getCloseMatches<T>(
  word: T[],
  possibilities: T[][],
  n: number = 3,
  cutoff: number = 0.6,
): T[][] {
  if (n <= 0) {
    throw new Error("n must be greater than 0");
  }
  const scored: Array<{ sequence: T[]; ratio: number }> = [];
  for (const possibility of possibilities) {
    const matcher = new SequenceMatcher<T>(null, word, possibility);
    // Both quick ratios are upper bounds on ratio(), so a candidate that
    // fails either one cannot reach the cutoff; skip the expensive
    // comparison for it.
    if (matcher.realQuickRatio() < cutoff || matcher.quickRatio() < cutoff) {
      continue;
    }
    const ratio = matcher.ratio();
    if (ratio >= cutoff) {
      scored.push({ sequence: possibility, ratio });
    }
  }
  // Sort by ratio (descending) and take top n
  scored.sort((x, y) => y.ratio - x.ratio);
  return scored.slice(0, n).map((entry) => entry.sequence);
}
/**
 * String front-end for SequenceMatcher.
 *
 * Every string is split into its characters (by iterating the string) and
 * the resulting arrays are handed to an internal `SequenceMatcher<string>`;
 * all positions in returned matches and opcodes index into those character
 * arrays.
 */
export class StringSequenceMatcher {
  private readonly matcher: SequenceMatcher<string>;

  constructor(
    isjunk: JunkFunction<string> | null = null,
    a: string = "",
    b: string = "",
    autojunk: boolean = true,
  ) {
    const first = StringSequenceMatcher.toChars(a);
    const second = StringSequenceMatcher.toChars(b);
    this.matcher = new SequenceMatcher(isjunk, first, second, autojunk);
  }

  /** Split a string into the character array the wrapped matcher works on. */
  private static toChars(text: string): string[] {
    return [...text];
  }

  /**
   * Replace both strings under comparison.
   */
  setSeqs(a: string, b: string): void {
    const first = StringSequenceMatcher.toChars(a);
    const second = StringSequenceMatcher.toChars(b);
    this.matcher.setSeqs(first, second);
  }

  /**
   * Replace the first string under comparison.
   */
  setSeq1(a: string): void {
    this.matcher.setSeq1(StringSequenceMatcher.toChars(a));
  }

  /**
   * Replace the second string under comparison.
   */
  setSeq2(b: string): void {
    this.matcher.setSeq2(StringSequenceMatcher.toChars(b));
  }

  /**
   * Longest matching block within a[alo:ahi] and b[blo:bhi]; delegates to
   * the wrapped matcher.
   */
  findLongestMatch(
    alo: number = 0,
    ahi: number | null = null,
    blo: number = 0,
    bhi: number | null = null,
  ): Match {
    const longest = this.matcher.findLongestMatch(alo, ahi, blo, bhi);
    return longest;
  }

  /**
   * Matching blocks of the two strings, as reported by the wrapped matcher.
   */
  getMatchingBlocks(): Match[] {
    const blocks = this.matcher.getMatchingBlocks();
    return blocks;
  }

  /**
   * Edit steps that turn a into b, as reported by the wrapped matcher.
   */
  getOpcodes(): OpCodeTuple[] {
    const steps = this.matcher.getOpcodes();
    return steps;
  }

  /**
   * Similarity of the two strings (0.0-1.0).
   */
  ratio(): number {
    const score = this.matcher.ratio();
    return score;
  }

  /**
   * Upper bound on ratio(), computed relatively quickly.
   */
  quickRatio(): number {
    const bound = this.matcher.quickRatio();
    return bound;
  }

  /**
   * Upper bound on ratio(), computed very quickly.
   */
  realQuickRatio(): number {
    const bound = this.matcher.realQuickRatio();
    return bound;
  }
}
/**
 * Similarity score of two strings, compared character by character.
 *
 * @returns The `ratio()` of a StringSequenceMatcher built without a junk
 *          predicate, with `a` as the first and `b` as the second string.
 */
export function getStringSimilarity(a: string, b: string): number {
  return new StringSequenceMatcher(null, a, b).ratio();
}
/**
 * Return the strings in `possibilities` that are most similar to `word`.
 *
 * Each candidate is scored character by character with
 * `StringSequenceMatcher.ratio()`, using `word` as the first string and the
 * candidate as the second.
 *
 * @param word          String to look for.
 * @param possibilities Candidate strings.
 * @param n             Maximum number of matches to return (default 3).
 * @param cutoff        Minimum ratio a candidate needs to be returned
 *                      (default 0.6).
 * @returns Up to `n` candidates, best score first; candidates with equal
 *          scores keep their input order.
 * @throws Error if `n` is not greater than 0.
 */
export function getCloseStringMatches(
  word: string,
  possibilities: string[],
  n: number = 3,
  cutoff: number = 0.6,
): string[] {
  if (n <= 0) {
    throw new Error("n must be greater than 0");
  }
  const scored: Array<{ string: string; ratio: number }> = [];
  for (const possibility of possibilities) {
    const matcher = new StringSequenceMatcher(null, word, possibility);
    // Both quick ratios are upper bounds on ratio(), so a candidate that
    // fails either one cannot reach the cutoff; skip the expensive
    // comparison for it.
    if (matcher.realQuickRatio() < cutoff || matcher.quickRatio() < cutoff) {
      continue;
    }
    const ratio = matcher.ratio();
    if (ratio >= cutoff) {
      scored.push({ string: possibility, ratio });
    }
  }
  // Sort by ratio (descending) and take top n
  scored.sort((x, y) => y.ratio - x.ratio);
  return scored.slice(0, n).map((entry) => entry.string);
}