& so it begins
This commit is contained in:
106
packages/logic/core/string.utils/index.ts
Normal file
106
packages/logic/core/string.utils/index.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
import * as v from "valibot";
|
||||
|
||||
/**
 * Upper-cases the first character of a string.
 *
 * @param input - Text to capitalize.
 * @param firstOfAllWords - When truthy, the first character of every
 *   single-space-separated word is upper-cased instead of only the first
 *   character of the whole string.
 * @returns The capitalized text; the remaining characters are left untouched.
 */
export function capitalize(input: string, firstOfAllWords?: boolean): string {
  const upperFirst = (text: string): string =>
    text.charAt(0).toUpperCase() + text.slice(1);

  if (firstOfAllWords) {
    // Splitting and re-joining on a single space keeps runs of spaces intact.
    return input
      .split(" ")
      .map((word) => upperFirst(word))
      .join(" ");
  }

  return upperFirst(input);
}
|
||||
|
||||
/**
 * Converts a camelCase identifier into space-separated words with the first
 * character upper-cased, e.g. `"fooBar"` becomes `"Foo Bar"`.
 *
 * A space is inserted before every character that equals its own upper-case
 * form. That includes digits and other caseless characters, so `"userId2"`
 * becomes `"User Id 2"`. No space is inserted at the very start of the
 * output, before a character that is itself a space, or directly after a
 * space, so PascalCase input and input that already contains spaces do not
 * gain leading or doubled spaces.
 *
 * @param input - camelCase (or PascalCase) text.
 * @returns The spaced text with its first character upper-cased.
 */
export function camelToSpacedPascal(input: string): string {
  let result = "";
  let previousChar = "";

  for (const char of input) {
    const startsNewWord =
      char !== " " && // a space equals its own upper-case form; never pad it
      char === char.toUpperCase() &&
      result !== "" && // no leading space when the input starts upper-case
      previousChar !== " ";

    if (startsNewWord) {
      result += " ";
    }
    result += char;
    previousChar = char;
  }

  return result.charAt(0).toUpperCase() + result.slice(1);
}
|
||||
|
||||
/**
 * Converts snake_case (or kebab-case) text to camelCase.
 *
 * The input is split on `_` and `-`. The first segment is kept as-is and
 * every following segment is passed through `capitalize(segment, true)`
 * before the pieces are concatenated.
 *
 * @param input - Text to convert; falsy input is returned unchanged.
 * @returns The camelCased text.
 */
export function snakeToCamel(input: string): string {
  if (!input) {
    return input;
  }

  const [head, ...rest] = input.split(/[-_]/);
  const capitalizedRest = rest.map((segment) => capitalize(segment, true));

  return [head ?? "", ...capitalizedRest].join("");
}
|
||||
|
||||
/**
 * Converts snake_case (or kebab-case) text to space-separated words by
 * running it through `snakeToCamel` and then `camelToSpacedPascal`.
 *
 * @param input - Text to convert.
 * @returns The result of the two conversions applied in sequence.
 */
export function snakeToSpacedPascal(input: string): string {
  const camelCased = snakeToCamel(input);
  return camelToSpacedPascal(camelCased);
}
|
||||
|
||||
/**
 * Converts space-separated words to snake_case: every single space becomes
 * an underscore and the whole string is lower-cased.
 *
 * @param input - Space-separated text such as `"Foo Bar"`.
 * @returns The snake_case text, e.g. `"foo_bar"`.
 */
export function spacedPascalToSnake(input: string): string {
  const underscored = input.replace(/ /g, "_");
  return underscored.toLowerCase();
}
|
||||
|
||||
/**
 * Converts dash-separated text to Title Case words separated by spaces.
 *
 * Each `-`-delimited segment gets its first character upper-cased and the
 * rest lower-cased.
 *
 * @param input - Dash-separated text such as `"hello-world"`.
 * @returns The title-cased text, e.g. `"Hello World"`.
 */
export function convertDashedLowerToTitleCase(input: string): string {
  const titledWords: string[] = [];

  for (const segment of input.split("-")) {
    const head = segment.charAt(0).toUpperCase();
    const tail = segment.slice(1).toLowerCase();
    titledWords.push(head + tail);
  }

  return titledWords.join(" ");
}
|
||||
|
||||
/**
 * Serializes a cursor value to a base64 string.
 *
 * The value is JSON-stringified, converted to its UTF-8 byte sequence, and
 * base64-encoded with `btoa`.
 *
 * @param cursor - Any JSON-serializable value.
 * @returns The base64-encoded cursor.
 * @throws Error with message "Failed to encode cursor" when any step fails;
 *   the underlying error is logged with `console.error`.
 */
export function encodeCursor<T>(cursor: T): string {
  try {
    const percentEncoded = encodeURIComponent(JSON.stringify(cursor));

    // btoa only accepts code units up to 0xFF, so turn every %XX escape into
    // the single character with that byte value.
    const binary = percentEncoded.replace(
      /%([0-9A-F]{2})/g,
      (_match, hex) => String.fromCharCode(parseInt(hex, 16)),
    );

    return btoa(binary);
  } catch (error) {
    console.error("Error encoding cursor:", error);
    throw new Error("Failed to encode cursor");
  }
}
|
||||
|
||||
/**
 * Decodes a base64 cursor and validates it against a valibot schema.
 *
 * The cursor is base64-decoded with `atob`, its bytes are interpreted as
 * UTF-8, the resulting text is parsed as JSON, and the parsed value is
 * checked with `v.safeParse`.
 *
 * This function does not throw on a malformed cursor. Every outcome carries
 * a `success` flag: `true` with the validated `data`, or `false` with an
 * `error`.
 *
 * @param cursor - Base64 cursor string.
 * @param parser - valibot schema the decoded value must satisfy.
 * @returns `{ success: true, data }` when decoding and validation succeed.
 *   Otherwise `{ success: false, data: undefined, error }`, where the error
 *   message is the joined validation issue messages, or
 *   "Failed to decode cursor" when base64/UTF-8/JSON decoding fails (the
 *   underlying error is logged with `console.error`).
 */
export function decodeCursor<T>(
  cursor: string,
  parser: v.BaseSchema<any, T, any>,
):
  | { success: true; data: T; error?: undefined }
  | { success: false; data: undefined; error: Error } {
  try {
    // atob yields one character per byte; re-express each byte as a %XX
    // escape so decodeURIComponent can interpret the sequence as UTF-8.
    const percentEncoded = Array.from(
      atob(cursor),
      (byte) => "%" + byte.charCodeAt(0).toString(16).padStart(2, "0"),
    ).join("");

    const parsedData: unknown = JSON.parse(decodeURIComponent(percentEncoded));
    const result = v.safeParse(parser, parsedData);

    if (result.success) {
      return { success: true, data: result.output as T };
    }

    return {
      success: false,
      data: undefined,
      error: new Error(result.issues.map((i) => i.message).join(", ")),
    };
  } catch (error) {
    console.error("Error decoding cursor:", error);
    // Report decoding failures with the same shape as validation failures.
    return {
      success: false,
      data: undefined,
      error: new Error("Failed to decode cursor"),
    };
  }
}
|
||||
555
packages/logic/core/string.utils/sequence.matcher.ts
Normal file
555
packages/logic/core/string.utils/sequence.matcher.ts
Normal file
@@ -0,0 +1,555 @@
|
||||
/**
|
||||
* Similar to Python's difflib.SequenceMatcher
|
||||
*
|
||||
* A flexible class for comparing pairs of sequences of any type.
|
||||
* Uses the Ratcliff-Obershelp algorithm with "gestalt pattern matching"
|
||||
* to find the longest contiguous matching subsequences.
|
||||
*/
|
||||
|
||||
/**
 * A block of `size` consecutive elements that sequence a (starting at index
 * `a`) and sequence b (starting at index `b`) have in common.
 */
export interface Match {
  /** Index in sequence a where the block begins */
  a: number;
  /** Index in sequence b where the block begins */
  b: number;
  /** Number of elements in the block */
  size: number;
}
|
||||
|
||||
/** Kind of edit step used when describing how to turn sequence a into b. */
export type OpCode =
  | "replace"
  | "delete"
  | "insert"
  | "equal";
|
||||
|
||||
/**
 * One edit step: the operation `tag` relates the range `i1..i2` of
 * sequence a to the range `j1..j2` of sequence b.
 */
export interface OpCodeTuple {
  /** Which operation this step performs */
  tag: OpCode;
  /** Index in sequence a where the range starts */
  i1: number;
  /** Index in sequence a where the range ends */
  i2: number;
  /** Index in sequence b where the range starts */
  j1: number;
  /** Index in sequence b where the range ends */
  j2: number;
}
|
||||
|
||||
/** Predicate that returns true for elements the matcher should treat as junk. */
export type JunkFunction<T> = (element: T) => boolean;
|
||||
|
||||
/**
 * Compares two sequences of arbitrary elements by repeatedly locating the
 * longest contiguous block they share and recursing on the pieces to the
 * left and right of it.
 *
 * Elements are compared with `===` and used as `Map`/`Set` keys, so object
 * elements match by identity only.
 *
 * Sequence b is pre-indexed when it is set. Elements flagged by the `isjunk`
 * predicate, and (with `autojunk`) elements that are very frequent in a long
 * sequence b, are left out of that index and therefore never start a match.
 */
export class SequenceMatcher<T> {
  private isjunk: JunkFunction<T> | null;
  private a: T[];
  private b: T[];
  private autojunk: boolean;

  // Lookup structures derived from sequence b; rebuilt by chainB()
  private bjunk: Set<T>;
  private bpopular: Set<T>;
  private b2j: Map<T, number[]>;

  // Lazily computed results; reset whenever a sequence is replaced
  private fullbcount: Map<T, number> | null = null;
  private matchingBlocks: Match[] | null = null;
  private opcodes: OpCodeTuple[] | null = null;

  /**
   * @param isjunk - Optional predicate marking elements of b as junk.
   * @param a - First sequence (copied).
   * @param b - Second sequence (copied and indexed).
   * @param autojunk - When true, elements occurring more than
   *   `floor(n / 100) + 1` times in a sequence b of length n >= 200 are
   *   treated as "popular" and excluded from the position index.
   */
  constructor(
    isjunk: JunkFunction<T> | null = null,
    a: T[] = [],
    b: T[] = [],
    autojunk: boolean = true,
  ) {
    this.isjunk = isjunk;
    this.a = [];
    this.b = [];
    this.autojunk = autojunk;
    this.bjunk = new Set();
    this.bpopular = new Set();
    this.b2j = new Map();

    this.setSeqs(a, b);
  }

  /**
   * Set both sequences to be compared
   */
  setSeqs(a: T[], b: T[]): void {
    this.setSeq1(a);
    this.setSeq2(b);
  }

  /**
   * Set the first sequence to be compared. The array is copied and cached
   * matching blocks and opcodes are discarded.
   */
  setSeq1(a: T[]): void {
    if (a === this.a) return;
    this.a = [...a];
    this.matchingBlocks = null;
    this.opcodes = null;
  }

  /**
   * Set the second sequence to be compared. The array is copied, all cached
   * results are discarded, and the lookup structures for b are rebuilt.
   */
  setSeq2(b: T[]): void {
    if (b === this.b) return;
    this.b = [...b];
    this.matchingBlocks = null;
    this.opcodes = null;
    this.fullbcount = null;
    this.chainB();
  }

  /**
   * Analyze sequence b and build lookup structures: the junk set, the
   * popular set, and `b2j`, which maps each remaining element to the
   * ascending list of indices at which it occurs in b.
   */
  private chainB(): void {
    const b = this.b;
    this.bjunk = new Set();
    this.bpopular = new Set();
    this.b2j = new Map();

    // Count occurrences of each element
    const elementCounts = new Map<T, number>();
    for (const element of b) {
      elementCounts.set(element, (elementCounts.get(element) ?? 0) + 1);
    }

    // Determine junk and popular elements
    const n = b.length;
    const popularThreshold = Math.floor(n / 100) + 1;

    for (const [element, count] of elementCounts) {
      if (this.isjunk && this.isjunk(element)) {
        this.bjunk.add(element);
      } else if (this.autojunk && n >= 200 && count > popularThreshold) {
        this.bpopular.add(element);
      }
    }

    // Build position mapping for non-junk, non-popular elements
    for (let i = 0; i < b.length; i++) {
      const element = b[i];
      if (this.bjunk.has(element) || this.bpopular.has(element)) {
        continue;
      }
      const positions = this.b2j.get(element);
      if (positions) {
        positions.push(i);
      } else {
        this.b2j.set(element, [i]);
      }
    }
  }

  /**
   * Find the longest matching block in a[alo:ahi] and b[blo:bhi].
   *
   * The block is first located using indexed (non-junk, non-popular)
   * elements only, then widened on both sides: first over equal elements
   * that are not junk, then over equal elements that are junk. Every
   * element pair inside the returned block is therefore equal.
   *
   * @param alo - Inclusive start of the range in a (default 0).
   * @param ahi - Exclusive end of the range in a (default: length of a).
   * @param blo - Inclusive start of the range in b (default 0).
   * @param bhi - Exclusive end of the range in b (default: length of b).
   * @returns The block found; `{ a: alo, b: blo, size: 0 }` when nothing
   *   matches.
   */
  findLongestMatch(
    alo: number = 0,
    ahi: number | null = null,
    blo: number = 0,
    bhi: number | null = null,
  ): Match {
    if (ahi === null) ahi = this.a.length;
    if (bhi === null) bhi = this.b.length;

    let besti = alo;
    let bestj = blo;
    let bestsize = 0;

    // j2len maps an index j in b to the length of the longest match that
    // ends at a[i - 1] and b[j]; one fresh row is built per element of a.
    let j2len = new Map<number, number>();

    for (let i = alo; i < ahi; i++) {
      const newj2len = new Map<number, number>();
      const positions = this.b2j.get(this.a[i]);

      if (positions) {
        for (const j of positions) {
          // positions are ascending, so we can stop at the first j >= bhi
          if (j < blo) continue;
          if (j >= bhi) break;

          const k = (j2len.get(j - 1) ?? 0) + 1;
          newj2len.set(j, k);

          if (k > bestsize) {
            besti = i - k + 1;
            bestj = j - k + 1;
            bestsize = k;
          }
        }
      }

      j2len = newj2len;
    }

    // Extend the match over equal non-junk elements on both sides; this
    // picks up "popular" elements, which are absent from b2j.
    while (
      besti > alo &&
      bestj > blo &&
      !this.isBJunk(this.b[bestj - 1]) &&
      this.elementsEqual(this.a[besti - 1], this.b[bestj - 1])
    ) {
      besti--;
      bestj--;
      bestsize++;
    }

    while (
      besti + bestsize < ahi &&
      bestj + bestsize < bhi &&
      !this.isBJunk(this.b[bestj + bestsize]) &&
      this.elementsEqual(this.a[besti + bestsize], this.b[bestj + bestsize])
    ) {
      bestsize++;
    }

    // Extend the match over junk elements at the beginning, but only while
    // the junk element of b really equals the corresponding element of a.
    while (
      besti > alo &&
      bestj > blo &&
      this.isBJunk(this.b[bestj - 1]) &&
      this.elementsEqual(this.a[besti - 1], this.b[bestj - 1])
    ) {
      besti--;
      bestj--;
      bestsize++;
    }

    // Extend the match over junk elements at the end, same equality rule.
    while (
      besti + bestsize < ahi &&
      bestj + bestsize < bhi &&
      this.isBJunk(this.b[bestj + bestsize]) &&
      this.elementsEqual(this.a[besti + bestsize], this.b[bestj + bestsize])
    ) {
      bestsize++;
    }

    return { a: besti, b: bestj, size: bestsize };
  }

  /**
   * Return list of non-overlapping matching blocks in increasing order,
   * terminated by a sentinel `{ a: a.length, b: b.length, size: 0 }`.
   *
   * The list is computed once and the cached array itself is returned on
   * later calls, so callers should treat it as read-only.
   */
  getMatchingBlocks(): Match[] {
    if (this.matchingBlocks !== null) {
      return this.matchingBlocks;
    }

    const matches: Match[] = [];
    this.getMatchingBlocksRecursive(
      0,
      this.a.length,
      0,
      this.b.length,
      matches,
    );

    // Add sentinel
    matches.push({ a: this.a.length, b: this.b.length, size: 0 });

    this.matchingBlocks = matches;
    return matches;
  }

  /**
   * Recursively find matching blocks: take the longest match in the given
   * ranges, then handle the unmatched pieces before and after it. Blocks
   * are appended to `matches` in left-to-right order.
   */
  private getMatchingBlocksRecursive(
    alo: number,
    ahi: number,
    blo: number,
    bhi: number,
    matches: Match[],
  ): void {
    const match = this.findLongestMatch(alo, ahi, blo, bhi);
    if (match.size === 0) {
      return;
    }

    // Recurse on the piece before the match
    if (alo < match.a && blo < match.b) {
      this.getMatchingBlocksRecursive(alo, match.a, blo, match.b, matches);
    }

    matches.push(match);

    // Recurse on the piece after the match
    const aEnd = match.a + match.size;
    const bEnd = match.b + match.size;
    if (aEnd < ahi && bEnd < bhi) {
      this.getMatchingBlocksRecursive(aEnd, ahi, bEnd, bhi, matches);
    }
  }

  /**
   * Return list of 5-tuples describing how to turn a into b.
   *
   * The list is computed once and the cached array itself is returned on
   * later calls, so callers should treat it as read-only.
   */
  getOpcodes(): OpCodeTuple[] {
    if (this.opcodes !== null) {
      return this.opcodes;
    }

    // i and j track how far into a and b the opcodes emitted so far reach.
    let i = 0;
    let j = 0;
    const opcodes: OpCodeTuple[] = [];

    for (const match of this.getMatchingBlocks()) {
      // Classify the gap between the previous block and this one.
      let tag: OpCode = "equal";

      if (i < match.a && j < match.b) {
        tag = "replace";
      } else if (i < match.a) {
        tag = "delete";
      } else if (j < match.b) {
        tag = "insert";
      }

      if (tag !== "equal") {
        opcodes.push({
          tag,
          i1: i,
          i2: match.a,
          j1: j,
          j2: match.b,
        });
      }

      i = match.a + match.size;
      j = match.b + match.size;

      // Don't add the sentinel match
      if (match.size > 0) {
        opcodes.push({
          tag: "equal",
          i1: match.a,
          i2: i,
          j1: match.b,
          j2: j,
        });
      }
    }

    this.opcodes = opcodes;
    return opcodes;
  }

  /**
   * Return a measure of sequences' similarity (0.0-1.0): twice the number
   * of matched elements divided by the combined length of both sequences.
   * Two empty sequences yield 1.0.
   */
  ratio(): number {
    const matches = this.getMatchingBlocks()
      .slice(0, -1) // Exclude sentinel
      .reduce((sum, match) => sum + match.size, 0);

    const total = this.a.length + this.b.length;
    return total === 0 ? 1.0 : (2.0 * matches) / total;
  }

  /**
   * Return an upper bound on ratio() relatively quickly by counting the
   * elements the two sequences share, ignoring their order.
   */
  quickRatio(): number {
    if (this.fullbcount === null) {
      this.fullbcount = new Map();
      for (const element of this.b) {
        this.fullbcount.set(
          element,
          (this.fullbcount.get(element) ?? 0) + 1,
        );
      }
    }

    let matches = 0;
    // Work on a copy so the cached counts survive for later calls.
    const tempCounts = new Map(this.fullbcount);

    for (const element of this.a) {
      const count = tempCounts.get(element);
      if (count && count > 0) {
        matches++;
        tempCounts.set(element, count - 1);
      }
    }

    const total = this.a.length + this.b.length;
    return total === 0 ? 1.0 : (2.0 * matches) / total;
  }

  /**
   * Return an upper bound on ratio() very quickly, using only the lengths
   * of the two sequences.
   */
  realQuickRatio(): number {
    const total = this.a.length + this.b.length;
    return total === 0
      ? 1.0
      : (2.0 * Math.min(this.a.length, this.b.length)) / total;
  }

  /**
   * Check if element is junk in sequence b
   */
  private isBJunk(element: T): boolean {
    return this.bjunk.has(element);
  }

  /**
   * Check if two elements are equal (strict equality)
   */
  private elementsEqual(a: T, b: T): boolean {
    return a === b;
  }
}
|
||||
|
||||
/**
 * Returns the candidate sequences most similar to `word`, best first.
 *
 * Each candidate is scored with `SequenceMatcher.ratio()` against `word`;
 * candidates scoring below `cutoff` are dropped and at most `n` of the rest
 * are returned in descending score order.
 *
 * @param word - Sequence to look up.
 * @param possibilities - Candidate sequences to compare against.
 * @param n - Maximum number of results (default 3).
 * @param cutoff - Minimum ratio a candidate needs to be kept (default 0.6).
 * @returns Up to `n` candidates, most similar first.
 * @throws Error when `n` is not greater than 0.
 */
export function getCloseMatches<T>(
  word: T[],
  possibilities: T[][],
  n: number = 3,
  cutoff: number = 0.6,
): T[][] {
  if (n <= 0) {
    throw new Error("n must be greater than 0");
  }

  return possibilities
    .map((sequence) => ({
      sequence,
      ratio: new SequenceMatcher(null, word, sequence).ratio(),
    }))
    .filter((candidate) => candidate.ratio >= cutoff)
    .sort((left, right) => right.ratio - left.ratio)
    .slice(0, n)
    .map((candidate) => candidate.sequence);
}
|
||||
|
||||
/**
 * String front end for SequenceMatcher.
 *
 * Each string is expanded into an array of characters by iterating it, so
 * every index and size reported by this class counts Unicode code points
 * rather than UTF-16 code units. All methods delegate to the wrapped
 * SequenceMatcher.
 */
export class StringSequenceMatcher {
  private matcher: SequenceMatcher<string>;

  /**
   * @param isjunk - Optional predicate marking characters of b as junk.
   * @param a - First string.
   * @param b - Second string.
   * @param autojunk - Passed through to the wrapped SequenceMatcher.
   */
  constructor(
    isjunk: JunkFunction<string> | null = null,
    a: string = "",
    b: string = "",
    autojunk: boolean = true,
  ) {
    const charsOfA = [...a];
    const charsOfB = [...b];
    this.matcher = new SequenceMatcher(isjunk, charsOfA, charsOfB, autojunk);
  }

  /** Replaces both strings being compared. */
  setSeqs(a: string, b: string): void {
    this.matcher.setSeqs([...a], [...b]);
  }

  /** Replaces the first string being compared. */
  setSeq1(a: string): void {
    this.matcher.setSeq1([...a]);
  }

  /** Replaces the second string being compared. */
  setSeq2(b: string): void {
    this.matcher.setSeq2([...b]);
  }

  /**
   * Longest matching block within a[alo:ahi] and b[blo:bhi]; a `null`
   * upper bound is forwarded unchanged to the wrapped matcher.
   */
  findLongestMatch(
    alo: number = 0,
    ahi: number | null = null,
    blo: number = 0,
    bhi: number | null = null,
  ): Match {
    const longest = this.matcher.findLongestMatch(alo, ahi, blo, bhi);
    return longest;
  }

  /** Matching blocks as reported by the wrapped matcher. */
  getMatchingBlocks(): Match[] {
    const blocks = this.matcher.getMatchingBlocks();
    return blocks;
  }

  /** Edit steps that turn a into b, as reported by the wrapped matcher. */
  getOpcodes(): OpCodeTuple[] {
    const steps = this.matcher.getOpcodes();
    return steps;
  }

  /** Similarity of the two strings (0.0-1.0). */
  ratio(): number {
    const similarity = this.matcher.ratio();
    return similarity;
  }

  /** Upper bound on ratio(), computed relatively quickly. */
  quickRatio(): number {
    const upperBound = this.matcher.quickRatio();
    return upperBound;
  }

  /** Upper bound on ratio(), computed very quickly. */
  realQuickRatio(): number {
    const upperBound = this.matcher.realQuickRatio();
    return upperBound;
  }
}
|
||||
|
||||
/**
 * Similarity of two strings, computed as the `ratio()` of a
 * StringSequenceMatcher built with no junk predicate.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns The similarity ratio of the two strings.
 */
export function getStringSimilarity(a: string, b: string): number {
  return new StringSequenceMatcher(null, a, b).ratio();
}
|
||||
|
||||
/**
 * Returns the candidate strings most similar to `word`, best first.
 *
 * Each candidate is scored with `getStringSimilarity(word, candidate)`;
 * candidates scoring below `cutoff` are dropped and at most `n` of the rest
 * are returned in descending score order.
 *
 * @param word - String to look up.
 * @param possibilities - Candidate strings to compare against.
 * @param n - Maximum number of results (default 3).
 * @param cutoff - Minimum ratio a candidate needs to be kept (default 0.6).
 * @returns Up to `n` candidates, most similar first.
 * @throws Error when `n` is not greater than 0.
 */
export function getCloseStringMatches(
  word: string,
  possibilities: string[],
  n: number = 3,
  cutoff: number = 0.6,
): string[] {
  if (n <= 0) {
    throw new Error("n must be greater than 0");
  }

  return possibilities
    .map((candidate) => ({
      string: candidate,
      ratio: getStringSimilarity(word, candidate),
    }))
    .filter((scored) => scored.ratio >= cutoff)
    .sort((left, right) => right.ratio - left.ratio)
    .slice(0, n)
    .map((scored) => scored.string);
}
|
||||
Reference in New Issue
Block a user