556 lines
12 KiB
TypeScript
556 lines
12 KiB
TypeScript
/**
 * Similar to Python's difflib.SequenceMatcher.
 *
 * A flexible class for comparing pairs of sequences of any type.
 * Uses the Ratcliff-Obershelp algorithm ("gestalt pattern matching")
 * to find the longest contiguous matching subsequences.
 */
|
|
|
|
/**
 * Describes one matching block: `size` consecutive elements starting at
 * index `a` of the first sequence and index `b` of the second sequence.
 */
export interface Match {
  /** Starting position in sequence a */
  a: number;
  /** Starting position in sequence b */
  b: number;
  /** Length of the matching block */
  size: number;
}
|
|
|
|
/** Kind of edit step carried by an opcode entry. */
export type OpCode =
  | "replace"
  | "delete"
  | "insert"
  | "equal";
|
|
|
|
/**
 * One edit step: apply `tag` to the range `i1..i2` of sequence a and the
 * range `j1..j2` of sequence b. Both ranges are half-open (the end index is
 * exclusive).
 */
export interface OpCodeTuple {
  /** Operation type */
  tag: OpCode;
  /** Start index in sequence a */
  i1: number;
  /** End index in sequence a */
  i2: number;
  /** Start index in sequence b */
  j1: number;
  /** End index in sequence b */
  j2: number;
}
|
|
|
|
/**
 * Predicate deciding whether an element should be treated as junk.
 * Returns `true` for junk elements.
 */
export type JunkFunction<T> = (
  element: T,
) => boolean;
|
|
|
|
/**
 * Compares two sequences `a` and `b` of arbitrary elements by repeatedly
 * locating their longest common contiguous run.
 *
 * Elements are looked up through `Map`/`Set`, so two elements are considered
 * the same when they are the same `Map` key (primitives by value, objects by
 * reference).
 *
 * Both sequences are copied when set; later mutation of the caller's arrays
 * does not affect the matcher.
 */
export class SequenceMatcher<T> {
  private isjunk: JunkFunction<T> | null;
  private a: T[];
  private b: T[];
  private autojunk: boolean;

  // Lookup structures derived from sequence b; rebuilt by chainB().
  private bjunk: Set<T>;
  private bpopular: Set<T>;
  private b2j: Map<T, number[]>;

  // Lazily computed results; reset whenever a sequence is replaced.
  private fullbcount: Map<T, number> | null = null;
  private matchingBlocks: Match[] | null = null;
  private opcodes: OpCodeTuple[] | null = null;

  /**
   * @param isjunk   Predicate marking elements of b as junk, or `null` for none.
   * @param a        First sequence.
   * @param b        Second sequence.
   * @param autojunk When true, elements that are very frequent in a long `b`
   *                 (see chainB) are excluded from the position index.
   */
  constructor(
    isjunk: JunkFunction<T> | null = null,
    a: T[] = [],
    b: T[] = [],
    autojunk: boolean = true,
  ) {
    this.isjunk = isjunk;
    this.a = [];
    this.b = [];
    this.autojunk = autojunk;
    this.bjunk = new Set();
    this.bpopular = new Set();
    this.b2j = new Map();

    this.setSeqs(a, b);
  }

  /**
   * Set both sequences to be compared.
   */
  setSeqs(a: T[], b: T[]): void {
    this.setSeq1(a);
    this.setSeq2(b);
  }

  /**
   * Set the first sequence to be compared. The array is copied and cached
   * match results are discarded.
   */
  setSeq1(a: T[]): void {
    if (a === this.a) return;
    this.a = [...a];
    this.matchingBlocks = null;
    this.opcodes = null;
  }

  /**
   * Set the second sequence to be compared. The array is copied, cached
   * results are discarded and the lookup structures for b are rebuilt.
   */
  setSeq2(b: T[]): void {
    if (b === this.b) return;
    this.b = [...b];
    this.matchingBlocks = null;
    this.opcodes = null;
    this.fullbcount = null;
    this.chainB();
  }

  /**
   * Analyze sequence b and build the lookup structures:
   * - `bjunk`: elements for which `isjunk` returned true,
   * - `bpopular`: non-junk elements occurring more than
   *   `floor(n / 100) + 1` times when autojunk is on and `n >= 200`,
   * - `b2j`: positions in b of every remaining element, in ascending order.
   */
  private chainB(): void {
    const b = this.b;
    const n = b.length;
    this.bjunk = new Set();
    this.bpopular = new Set();
    this.b2j = new Map();

    // Count occurrences of each distinct element.
    const counts = new Map<T, number>();
    for (const element of b) {
      counts.set(element, (counts.get(element) ?? 0) + 1);
    }

    // Classify each distinct element once.
    const popularThreshold = Math.floor(n / 100) + 1;
    for (const [element, count] of counts) {
      if (this.isjunk !== null && this.isjunk(element)) {
        this.bjunk.add(element);
      } else if (this.autojunk && n >= 200 && count > popularThreshold) {
        this.bpopular.add(element);
      }
    }

    // Index the positions of everything that is neither junk nor popular.
    for (const [index, element] of b.entries()) {
      if (this.bjunk.has(element) || this.bpopular.has(element)) continue;
      const positions = this.b2j.get(element);
      if (positions === undefined) {
        this.b2j.set(element, [index]);
      } else {
        positions.push(index);
      }
    }
  }

  /**
   * Find the longest matching block in a[alo:ahi] and b[blo:bhi].
   *
   * The block is first located using only indexed (non-junk, non-popular)
   * elements, then widened on both sides over equal non-junk elements, and
   * finally over equal junk elements.
   *
   * @param alo Inclusive lower bound in a (default 0).
   * @param ahi Exclusive upper bound in a (`null` means the length of a).
   * @param blo Inclusive lower bound in b (default 0).
   * @param bhi Exclusive upper bound in b (`null` means the length of b).
   * @returns The block found; `size` is 0 when nothing matches, in which case
   *          `a`/`b` are `alo`/`blo`.
   */
  findLongestMatch(
    alo: number = 0,
    ahi: number | null = null,
    blo: number = 0,
    bhi: number | null = null,
  ): Match {
    const aEnd = ahi ?? this.a.length;
    const bEnd = bhi ?? this.b.length;

    let besti = alo;
    let bestj = blo;
    let bestsize = 0;

    // j2len maps j -> length of the longest match ending at a[i - 1], b[j].
    let j2len = new Map<number, number>();

    for (let i = alo; i < aEnd; i++) {
      const rowLengths = new Map<number, number>();
      const positions = this.b2j.get(this.a[i]);

      if (positions !== undefined) {
        for (const j of positions) {
          if (j < blo) continue;
          if (j >= bEnd) break; // positions are ascending

          const k = (j2len.get(j - 1) ?? 0) + 1;
          rowLengths.set(j, k);

          if (k > bestsize) {
            besti = i - k + 1;
            bestj = j - k + 1;
            bestsize = k;
          }
        }
      }

      // Only the previous row is ever consulted, so swap instead of copying.
      j2len = rowLengths;
    }

    // Widen over equal non-junk elements (this also picks up "popular"
    // elements, which are absent from b2j).
    while (
      besti > alo &&
      bestj > blo &&
      !this.isBJunk(this.b[bestj - 1]) &&
      this.elementsEqual(this.a[besti - 1], this.b[bestj - 1])
    ) {
      besti--;
      bestj--;
      bestsize++;
    }

    while (
      besti + bestsize < aEnd &&
      bestj + bestsize < bEnd &&
      !this.isBJunk(this.b[bestj + bestsize]) &&
      this.elementsEqual(this.a[besti + bestsize], this.b[bestj + bestsize])
    ) {
      bestsize++;
    }

    // Widen over junk elements, but only where a and b actually agree;
    // without the equality test unequal elements would be reported as matched.
    while (
      besti > alo &&
      bestj > blo &&
      this.isBJunk(this.b[bestj - 1]) &&
      this.elementsEqual(this.a[besti - 1], this.b[bestj - 1])
    ) {
      besti--;
      bestj--;
      bestsize++;
    }

    while (
      besti + bestsize < aEnd &&
      bestj + bestsize < bEnd &&
      this.isBJunk(this.b[bestj + bestsize]) &&
      this.elementsEqual(this.a[besti + bestsize], this.b[bestj + bestsize])
    ) {
      bestsize++;
    }

    return { a: besti, b: bestj, size: bestsize };
  }

  /**
   * Return the list of non-overlapping matching blocks, ordered by position.
   *
   * Adjacent blocks are merged into one, and the list always ends with a
   * sentinel `{ a: a.length, b: b.length, size: 0 }`. The result is cached;
   * the same array instance is returned until a sequence is replaced.
   */
  getMatchingBlocks(): Match[] {
    if (this.matchingBlocks !== null) {
      return this.matchingBlocks;
    }

    const aLength = this.a.length;
    const bLength = this.b.length;

    // Explicit work list instead of recursion, so the call stack does not
    // grow with the number of matches.
    const pending: Array<[number, number, number, number]> = [
      [0, aLength, 0, bLength],
    ];
    const found: Match[] = [];

    for (let next = pending.pop(); next !== undefined; next = pending.pop()) {
      const [alo, ahi, blo, bhi] = next;
      const match = this.findLongestMatch(alo, ahi, blo, bhi);
      if (match.size === 0) continue;

      found.push(match);
      if (alo < match.a && blo < match.b) {
        pending.push([alo, match.a, blo, match.b]);
      }
      if (match.a + match.size < ahi && match.b + match.size < bhi) {
        pending.push([match.a + match.size, ahi, match.b + match.size, bhi]);
      }
    }

    // The work list visits regions out of order; restore positional order.
    found.sort((x, y) => x.a - y.a || x.b - y.b);

    // Merge blocks that touch in both sequences.
    const blocks: Match[] = [];
    let current: Match = { a: 0, b: 0, size: 0 };
    for (const match of found) {
      if (
        current.a + current.size === match.a &&
        current.b + current.size === match.b
      ) {
        current.size += match.size;
      } else {
        if (current.size > 0) blocks.push(current);
        current = { a: match.a, b: match.b, size: match.size };
      }
    }
    if (current.size > 0) blocks.push(current);

    // Sentinel marking the end of both sequences.
    blocks.push({ a: aLength, b: bLength, size: 0 });

    this.matchingBlocks = blocks;
    return blocks;
  }

  /**
   * Return the list of edit steps describing how to turn a into b.
   *
   * Steps are contiguous: each one starts where the previous one ended in
   * both sequences. The result is cached; the same array instance is
   * returned until a sequence is replaced.
   */
  getOpcodes(): OpCodeTuple[] {
    if (this.opcodes !== null) {
      return this.opcodes;
    }

    const opcodes: OpCodeTuple[] = [];
    let i = 0;
    let j = 0;

    for (const match of this.getMatchingBlocks()) {
      // Describe the gap between the previous block and this one, if any.
      let gap: OpCode | null = null;
      if (i < match.a && j < match.b) {
        gap = "replace";
      } else if (i < match.a) {
        gap = "delete";
      } else if (j < match.b) {
        gap = "insert";
      }

      if (gap !== null) {
        opcodes.push({ tag: gap, i1: i, i2: match.a, j1: j, j2: match.b });
      }

      i = match.a + match.size;
      j = match.b + match.size;

      // The sentinel has size 0 and produces no "equal" step.
      if (match.size > 0) {
        opcodes.push({ tag: "equal", i1: match.a, i2: i, j1: match.b, j2: j });
      }
    }

    this.opcodes = opcodes;
    return opcodes;
  }

  /**
   * Return a measure of the sequences' similarity (0.0-1.0):
   * `2 * matched / (a.length + b.length)`, or 1.0 when both are empty.
   */
  ratio(): number {
    let matched = 0;
    for (const block of this.getMatchingBlocks()) {
      matched += block.size; // the sentinel contributes 0
    }

    const total = this.a.length + this.b.length;
    return total === 0 ? 1.0 : (2.0 * matched) / total;
  }

  /**
   * Return an upper bound on ratio() relatively quickly, by counting
   * elements common to both sequences regardless of their order.
   */
  quickRatio(): number {
    if (this.fullbcount === null) {
      const counts = new Map<T, number>();
      for (const element of this.b) {
        counts.set(element, (counts.get(element) ?? 0) + 1);
      }
      this.fullbcount = counts;
    }

    // Work on a copy so the cached counts of b stay intact.
    const available = new Map(this.fullbcount);
    let matched = 0;

    for (const element of this.a) {
      const remaining = available.get(element);
      if (remaining !== undefined && remaining > 0) {
        matched++;
        available.set(element, remaining - 1);
      }
    }

    const total = this.a.length + this.b.length;
    return total === 0 ? 1.0 : (2.0 * matched) / total;
  }

  /**
   * Return an upper bound on ratio() very quickly, using only the two
   * sequence lengths.
   */
  realQuickRatio(): number {
    const total = this.a.length + this.b.length;
    return total === 0
      ? 1.0
      : (2.0 * Math.min(this.a.length, this.b.length)) / total;
  }

  /**
   * Check if element is junk in sequence b.
   */
  private isBJunk(element: T): boolean {
    return this.bjunk.has(element);
  }

  /**
   * Check if two elements are equal, using the same rule as the Map/Set
   * lookups (strict equality, except that NaN equals NaN).
   */
  private elementsEqual(a: T, b: T): boolean {
    return a === b || (a !== a && b !== b);
  }
}
|
|
|
|
/**
 * Utility function to get close matches, similar to Python's
 * get_close_matches.
 *
 * Each candidate is scored with `SequenceMatcher.ratio()` using `word` as
 * the first sequence and the candidate as the second.
 *
 * @param word          Sequence to look for.
 * @param possibilities Candidate sequences.
 * @param n             Maximum number of results (default 3).
 * @param cutoff        Minimum ratio a candidate must reach (default 0.6).
 * @returns Up to `n` candidates scoring at least `cutoff`, best first;
 *          candidates with equal scores keep their input order.
 * @throws Error when `n` is not greater than 0.
 */
export function getCloseMatches<T>(
  word: T[],
  possibilities: T[][],
  n: number = 3,
  cutoff: number = 0.6,
): T[][] {
  if (n <= 0) {
    throw new Error("n must be greater than 0");
  }

  const scored: Array<{ sequence: T[]; ratio: number }> = [];

  // One matcher is reused so `word` is copied only once.
  const matcher = new SequenceMatcher<T>(null, word, []);

  for (const possibility of possibilities) {
    matcher.setSeq2(possibility);

    // realQuickRatio() and quickRatio() are upper bounds on ratio(), so a
    // candidate that fails either cheap test cannot reach the cutoff.
    if (matcher.realQuickRatio() < cutoff || matcher.quickRatio() < cutoff) {
      continue;
    }

    const ratio = matcher.ratio();
    if (ratio >= cutoff) {
      scored.push({ sequence: possibility, ratio });
    }
  }

  // Sort by ratio (descending) and take the top n.
  scored.sort((x, y) => y.ratio - x.ratio);
  return scored.slice(0, n).map((entry) => entry.sequence);
}
|
|
|
|
/**
 * String-specific version of SequenceMatcher for character-by-character
 * comparison. Strings are split with `Array.from`, so one "character" is one
 * Unicode code point, and every index reported by this class is a code-point
 * offset (which differs from a string index when a string contains
 * characters outside the Basic Multilingual Plane).
 */
export class StringSequenceMatcher {
  private readonly matcher: SequenceMatcher<string>;

  /**
   * @param isjunk   Predicate marking characters of b as junk, or `null`.
   * @param a        First string.
   * @param b        Second string.
   * @param autojunk Forwarded unchanged to the underlying SequenceMatcher.
   */
  constructor(
    isjunk: JunkFunction<string> | null = null,
    a: string = "",
    b: string = "",
    autojunk: boolean = true,
  ) {
    this.matcher = new SequenceMatcher(
      isjunk,
      Array.from(a),
      Array.from(b),
      autojunk,
    );
  }

  /**
   * Set both strings to be compared.
   */
  setSeqs(a: string, b: string): void {
    this.matcher.setSeqs(Array.from(a), Array.from(b));
  }

  /**
   * Set the first string to be compared.
   */
  setSeq1(a: string): void {
    this.matcher.setSeq1(Array.from(a));
  }

  /**
   * Set the second string to be compared.
   */
  setSeq2(b: string): void {
    this.matcher.setSeq2(Array.from(b));
  }

  /**
   * Find the longest matching block in a[alo:ahi] and b[blo:bhi].
   * Bounds and results are expressed in characters (code points);
   * `null` upper bounds are resolved by the underlying matcher.
   */
  findLongestMatch(
    alo: number = 0,
    ahi: number | null = null,
    blo: number = 0,
    bhi: number | null = null,
  ): Match {
    return this.matcher.findLongestMatch(alo, ahi, blo, bhi);
  }

  /**
   * Return the list of non-overlapping matching blocks.
   */
  getMatchingBlocks(): Match[] {
    return this.matcher.getMatchingBlocks();
  }

  /**
   * Return the list of edit steps describing how to turn a into b.
   */
  getOpcodes(): OpCodeTuple[] {
    return this.matcher.getOpcodes();
  }

  /**
   * Return a measure of the strings' similarity (0.0-1.0).
   */
  ratio(): number {
    return this.matcher.ratio();
  }

  /**
   * Return an upper bound on ratio() relatively quickly.
   */
  quickRatio(): number {
    return this.matcher.quickRatio();
  }

  /**
   * Return an upper bound on ratio() very quickly.
   */
  realQuickRatio(): number {
    return this.matcher.realQuickRatio();
  }
}
|
|
|
|
/**
 * Utility function for string similarity.
 *
 * @param a First string.
 * @param b Second string.
 * @returns `ratio()` of a StringSequenceMatcher built with no junk function
 *          and the default autojunk setting.
 */
export function getStringSimilarity(a: string, b: string): number {
  return new StringSequenceMatcher(null, a, b).ratio();
}
|
|
|
|
/**
 * Get close string matches.
 *
 * Each candidate is scored with `StringSequenceMatcher.ratio()` using `word`
 * as the first string and the candidate as the second.
 *
 * @param word          String to look for.
 * @param possibilities Candidate strings.
 * @param n             Maximum number of results (default 3).
 * @param cutoff        Minimum ratio a candidate must reach (default 0.6).
 * @returns Up to `n` candidates scoring at least `cutoff`, best first;
 *          candidates with equal scores keep their input order.
 * @throws Error when `n` is not greater than 0.
 */
export function getCloseStringMatches(
  word: string,
  possibilities: string[],
  n: number = 3,
  cutoff: number = 0.6,
): string[] {
  if (n <= 0) {
    throw new Error("n must be greater than 0");
  }

  const scored: Array<{ string: string; ratio: number }> = [];

  // One matcher is reused so `word` is split into characters only once.
  const matcher = new StringSequenceMatcher(null, word, "");

  for (const possibility of possibilities) {
    matcher.setSeq2(possibility);

    // realQuickRatio() and quickRatio() are upper bounds on ratio(), so a
    // candidate that fails either cheap test cannot reach the cutoff.
    if (matcher.realQuickRatio() < cutoff || matcher.quickRatio() < cutoff) {
      continue;
    }

    const ratio = matcher.ratio();
    if (ratio >= cutoff) {
      scored.push({ string: possibility, ratio });
    }
  }

  // Sort by ratio (descending) and take the top n.
  scored.sort((x, y) => y.ratio - x.ratio);
  return scored.slice(0, n).map((entry) => entry.string);
}
|