From de29685b0b9d213ebeccccaeb3784fc1085a5ff6 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Thu, 1 Aug 2024 12:57:39 +0200 Subject: [PATCH] fix: Improve perf of camel case word splitter. (#6019) --- packages/cspell-lib/api/api.d.ts | 2 +- packages/cspell-lib/api/rollup.config.mjs | 2 +- packages/cspell-lib/package.json | 18 +-- packages/cspell-lib/src/lib-cjs/index.cts | 6 - packages/cspell-lib/src/lib-cjs/pkg-info.cts | 3 - .../cspell-lib/src/lib-cjs/tsconfig.cjs.json | 12 -- packages/cspell-lib/src/lib-cjs/tsconfig.json | 3 - .../cspell-lib/src/lib-cjs/tsconfig.test.json | 14 -- .../Controller/configLoader/configLoader.ts | 2 +- .../src/lib/Settings/DefaultSettings.ts | 2 +- packages/cspell-lib/src/lib/pkg-info.mts | 22 +++ .../src/lib/textValidation/isWordValid.ts | 10 +- .../textValidation/lineValidatorFactory.ts | 35 +++-- .../cspell-lib/src/lib/util/resolveFile.ts | 2 +- packages/cspell-lib/src/lib/util/text.test.ts | 27 +++- packages/cspell-lib/src/lib/util/text.ts | 28 +--- packages/cspell-lib/src/lib/util/textRegex.ts | 6 +- packages/cspell-lib/src/lib/util/url.test.ts | 2 +- packages/cspell-lib/src/lib/util/url.ts | 2 +- .../src/lib/util/wordSplitter.perf.ts | 148 ++++++++++++++++++ .../cspell-lib/src/lib/util/wordSplitter.ts | 4 +- .../cspell-lib/src/test-util/tsconfig.json | 12 -- packages/cspell-lib/tsconfig.esm.json | 12 -- packages/cspell-lib/tsconfig.json | 9 +- .../logging/dictionary-logging.csv | 1 - pnpm-lock.yaml | 26 +-- 26 files changed, 281 insertions(+), 129 deletions(-) delete mode 100644 packages/cspell-lib/src/lib-cjs/index.cts delete mode 100644 packages/cspell-lib/src/lib-cjs/pkg-info.cts delete mode 100644 packages/cspell-lib/src/lib-cjs/tsconfig.cjs.json delete mode 100644 packages/cspell-lib/src/lib-cjs/tsconfig.json delete mode 100644 packages/cspell-lib/src/lib-cjs/tsconfig.test.json create mode 100644 packages/cspell-lib/src/lib/pkg-info.mts create mode 100644 packages/cspell-lib/src/lib/util/wordSplitter.perf.ts delete mode 
100644 packages/cspell-lib/src/test-util/tsconfig.json delete mode 100644 packages/cspell-lib/tsconfig.esm.json diff --git a/packages/cspell-lib/api/api.d.ts b/packages/cspell-lib/api/api.d.ts index 5022a8f27f3..053d922c3b5 100644 --- a/packages/cspell-lib/api/api.d.ts +++ b/packages/cspell-lib/api/api.d.ts @@ -987,7 +987,7 @@ declare function splitCamelCaseWord(word: string): string[]; */ declare function match(reg: RegExp, text: string): Iterable; declare function matchStringToTextOffset(reg: RegExp, text: string): Iterable; -declare function matchToTextOffset(reg: RegExp, text: TextOffset): Iterable; +declare function matchToTextOffset(reg: RegExp, t: TextOffset): Iterable; declare function extractLinesOfText(text: string): Iterable; /** * Extract out whole words from a string of text. diff --git a/packages/cspell-lib/api/rollup.config.mjs b/packages/cspell-lib/api/rollup.config.mjs index 92cc662847b..873bfa9a3a7 100644 --- a/packages/cspell-lib/api/rollup.config.mjs +++ b/packages/cspell-lib/api/rollup.config.mjs @@ -2,7 +2,7 @@ import dts from 'rollup-plugin-dts'; const config = [ { - input: './dist/esm/index.d.ts', + input: './dist/lib/index.d.ts', output: [{ file: './api/api.d.ts', format: 'es' }], plugins: [dts()], }, diff --git a/packages/cspell-lib/package.json b/packages/cspell-lib/package.json index d50357edb7d..67051bdc76f 100644 --- a/packages/cspell-lib/package.json +++ b/packages/cspell-lib/package.json @@ -4,11 +4,11 @@ "description": "A library of useful functions used across various cspell tools.", "type": "module", "sideEffects": false, - "types": "dist/esm/index.d.ts", - "module": "dist/esm/index.js", + "types": "dist/lib/index.d.ts", + "module": "dist/lib/index.js", "exports": { ".": { - "import": "./dist/esm/index.js" + "import": "./dist/lib/index.js" } }, "files": [ @@ -24,17 +24,16 @@ "scripts": { "clean": "shx rm -rf dist temp coverage \"*.tsbuildInfo\"", "clean-build": "pnpm clean && pnpm build", - "build": "tsc -b . 
-f && pnpm run build:api", + "build": "tsc -p . && pnpm run build:api", "build:api": "rollup -c api/rollup.config.mjs", - "build:esm": "tsc -b tsconfig.esm.json -f", - "build:lib": "tsc -b src/lib/tsconfig.json -f", - "watch": "tsc -b . --watch -f", + "watch": "tsc -p . --watch", "coverage": "vitest run --coverage --pool=forks", "test-watch": "vitest", "prepublishOnly": "pnpm run clean-build", "#test": "vitest run --reporter=hanging-process --reporter=default", "test": "vitest run --pool=forks", - "test:update-snapshot": "vitest run -u" + "test:update-snapshot": "vitest run -u", + "test:perf": "NODE_ENV=production insight --register ts-node/esm --file \"**/*.perf.{mts,ts}\"" }, "repository": { "type": "git", @@ -99,6 +98,7 @@ "configstore": "^7.0.0", "cspell-dict-nl-nl": "^1.1.2", "leaked-handles": "^5.2.0", - "lorem-ipsum": "^2.0.8" + "lorem-ipsum": "^2.0.8", + "perf-insight": "^1.2.0" } } diff --git a/packages/cspell-lib/src/lib-cjs/index.cts b/packages/cspell-lib/src/lib-cjs/index.cts deleted file mode 100644 index 5c1ac8165ad..00000000000 --- a/packages/cspell-lib/src/lib-cjs/index.cts +++ /dev/null @@ -1,6 +0,0 @@ -/** - * This module contains CJS only files. - * It includes files that use 3rd part libs that can only be CJS due to their exports. 
- */ - -export { srcDirectory } from './pkg-info.cjs'; diff --git a/packages/cspell-lib/src/lib-cjs/pkg-info.cts b/packages/cspell-lib/src/lib-cjs/pkg-info.cts deleted file mode 100644 index b655ffb718d..00000000000 --- a/packages/cspell-lib/src/lib-cjs/pkg-info.cts +++ /dev/null @@ -1,3 +0,0 @@ -// import { join } from 'path'; -// export const srcDirectory = join(__dirname, '/'); -export const srcDirectory = __dirname; diff --git a/packages/cspell-lib/src/lib-cjs/tsconfig.cjs.json b/packages/cspell-lib/src/lib-cjs/tsconfig.cjs.json deleted file mode 100644 index b07df51ade6..00000000000 --- a/packages/cspell-lib/src/lib-cjs/tsconfig.cjs.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "extends": "../../../../tsconfig.esm.json", - "compilerOptions": { - "allowJs": false, - "composite": true, - "tsBuildInfoFile": "../../temp/compile.lib-cjs.tsbuildInfo", - "rootDir": ".", - "outDir": "../../dist/lib-cjs", - "types": ["node"] - }, - "files": ["index.cts", "pkg-info.cts"] -} diff --git a/packages/cspell-lib/src/lib-cjs/tsconfig.json b/packages/cspell-lib/src/lib-cjs/tsconfig.json deleted file mode 100644 index eda43c7376d..00000000000 --- a/packages/cspell-lib/src/lib-cjs/tsconfig.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "references": [{ "path": "./tsconfig.cjs.json" }, { "path": "./tsconfig.test.json" }] -} diff --git a/packages/cspell-lib/src/lib-cjs/tsconfig.test.json b/packages/cspell-lib/src/lib-cjs/tsconfig.test.json deleted file mode 100644 index 7fbbcb0faf2..00000000000 --- a/packages/cspell-lib/src/lib-cjs/tsconfig.test.json +++ /dev/null @@ -1,14 +0,0 @@ -/* - Note: this configuration is needed to work around a bug in vite / vitest that cannot - convert .cts files to .cjs. 
- */ -{ - "extends": "./tsconfig.cjs.json", - "compilerOptions": { - "declarationMap": false, - "sourceMap": false, - "tsBuildInfoFile": "../../temp/compile.lib-test.tsbuildInfo", - "outDir": ".", - "types": ["node"] - } -} diff --git a/packages/cspell-lib/src/lib/Settings/Controller/configLoader/configLoader.ts b/packages/cspell-lib/src/lib/Settings/Controller/configLoader/configLoader.ts index 954028c9dd0..7dd7d3c0698 100644 --- a/packages/cspell-lib/src/lib/Settings/Controller/configLoader/configLoader.ts +++ b/packages/cspell-lib/src/lib/Settings/Controller/configLoader/configLoader.ts @@ -8,11 +8,11 @@ import { createReaderWriter, CSpellConfigFileInMemory } from 'cspell-config-lib' import { isUrlLike, toFileURL } from 'cspell-io'; import { URI, Utils as UriUtils } from 'vscode-uri'; -import { srcDirectory } from '../../../../lib-cjs/index.cjs'; import { onClearCache } from '../../../events/index.js'; import type { VFileSystem } from '../../../fileSystem.js'; import { getVirtualFS } from '../../../fileSystem.js'; import { createCSpellSettingsInternal as csi } from '../../../Models/CSpellSettingsInternalDef.js'; +import { srcDirectory } from '../../../pkg-info.mjs'; import { autoResolve, AutoResolveCache, autoResolveWeak } from '../../../util/AutoResolve.js'; import { logError, logWarning } from '../../../util/logger.js'; import { FileResolver } from '../../../util/resolveFile.js'; diff --git a/packages/cspell-lib/src/lib/Settings/DefaultSettings.ts b/packages/cspell-lib/src/lib/Settings/DefaultSettings.ts index 7437b84288d..9940b039cc7 100644 --- a/packages/cspell-lib/src/lib/Settings/DefaultSettings.ts +++ b/packages/cspell-lib/src/lib/Settings/DefaultSettings.ts @@ -1,10 +1,10 @@ import type { PredefinedPatterns, RegExpPatternDefinition } from '@cspell/cspell-types'; import { parsers } from 'cspell-grammar'; -import { srcDirectory } from '../../lib-cjs/index.cjs'; import type { CSpellSettingsInternal } from '../Models/CSpellSettingsInternalDef.js'; import { 
createCSpellSettingsInternal } from '../Models/CSpellSettingsInternalDef.js'; import { PatternRegExp } from '../Models/PatternRegExp.js'; +import { srcDirectory } from '../pkg-info.mjs'; import { resolveFile } from '../util/resolveFile.js'; import { defaultConfigFileModuleRef } from './constants.js'; import { readSettings } from './Controller/configLoader/index.js'; diff --git a/packages/cspell-lib/src/lib/pkg-info.mts b/packages/cspell-lib/src/lib/pkg-info.mts new file mode 100644 index 00000000000..732dbace653 --- /dev/null +++ b/packages/cspell-lib/src/lib/pkg-info.mts @@ -0,0 +1,22 @@ +import { fileURLToPath } from 'node:url'; + +/** + * This is the url of the current file, but it might be undefined if the environment does not support it. + */ +const url = import.meta.url; + +/** + * This is the CommonJS __dirname variable, but it might not be defined. + * ESBuild and some other bundlers do support it. + */ +declare const __dirname: string; + +function calcSrcDirectory() { + try { + return __dirname; + } catch { + return url ? fileURLToPath(new URL('./', url)) : process.cwd(); + } +} + +export const srcDirectory = calcSrcDirectory(); diff --git a/packages/cspell-lib/src/lib/textValidation/isWordValid.ts b/packages/cspell-lib/src/lib/textValidation/isWordValid.ts index 0fcd79a6ec9..8e6449b6d5e 100644 --- a/packages/cspell-lib/src/lib/textValidation/isWordValid.ts +++ b/packages/cspell-lib/src/lib/textValidation/isWordValid.ts @@ -1,13 +1,15 @@ -import type { CachingDictionary } from 'cspell-dictionary'; - import type { TextOffsetRO } from './ValidationTypes.js'; -function hasWordCheck(dict: CachingDictionary, word: string): boolean { +interface Dict { + has(word: string): boolean; +} + +function hasWordCheck(dict: Dict, word: string): boolean { word = word.includes('\\') ?
word.replaceAll('\\', '') : word; return dict.has(word); } -export function isWordValidWithEscapeRetry(dict: CachingDictionary, wo: TextOffsetRO, line: TextOffsetRO): boolean { +export function isWordValidWithEscapeRetry(dict: Dict, wo: TextOffsetRO, line: TextOffsetRO): boolean { const firstTry = hasWordCheck(dict, wo.text); return ( firstTry || diff --git a/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts b/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts index 7cc1e0166d7..67f185c0822 100644 --- a/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts +++ b/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts @@ -5,7 +5,12 @@ import { createCachingDictionary } from 'cspell-dictionary'; import type { ValidationIssue } from '../Models/ValidationIssue.js'; import * as RxPat from '../Settings/RegExpPatterns.js'; -import * as Text from '../util/text.js'; +import { + extractPossibleWordsFromTextOffset, + extractText, + extractWordsFromCodeTextOffset, + extractWordsFromTextOffset, +} from '../util/text.js'; import { split } from '../util/wordSplitter.js'; import { defaultMinWordLength } from './defaultConstants.js'; import { isWordValidWithEscapeRetry } from './isWordValid.js'; @@ -64,6 +69,17 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat return !setOfKnownSuccessfulWords.has(wo.text); }; + const hasDict = { + has(word: string): boolean { + const info = getWordInfo(word); + if (info.isFound !== undefined) return info.isFound; + if (info.isFlagged) return true; + if (info.isFlagged) return false; + info.isFound = dictCol.has(word); + return info.isFound; + }, + }; + function calcIgnored(info: WordStatusInfo): boolean { info.isIgnored ??= dictCol.isNoSuggestWord(info.word); return info.isIgnored; @@ -116,17 +132,16 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat const { isFlagged: isForbidden, isFound, isIgnored } = info; const isFlagged 
= issue.isFlagged ?? (!isIgnored && isForbidden); issue.isFlagged = isFlagged; - issue.isFound = isFound; + issue.isFound = isFlagged ? undefined : isFound; return issue; } const isIgnored = calcIgnored(info); const isFlagged = issue.isFlagged ?? calcFlagged(info); - const isFound = isFlagged ? undefined : isIgnored || isWordValidWithEscapeRetry(dictCol, issue, issue.line); + info.isFound ??= isFlagged ? false : isIgnored || isWordValidWithEscapeRetry(hasDict, issue, issue.line); info.isFlagged = !!isFlagged; - info.isFound = isFound; info.fin = true; issue.isFlagged = isFlagged; - issue.isFound = isFound; + issue.isFound = isFlagged ? undefined : info.isFound; return issue; } @@ -134,7 +149,7 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat function splitterIsValid(word: TextOffsetRO): boolean { return ( setOfKnownSuccessfulWords.has(word.text) || - (!isWordFlagged(word) && isWordValidWithEscapeRetry(dictCol, word, lineSegment.line)) + (!isWordFlagged(word) && isWordValidWithEscapeRetry(hasDict, word, lineSegment.line)) ); } @@ -145,7 +160,7 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat const codeWordResults: ValidationIssueRO[] = []; - for (const wo of Text.extractWordsFromCodeTextOffset(vr)) { + for (const wo of extractWordsFromCodeTextOffset(vr)) { if (setOfKnownSuccessfulWords.has(wo.text)) continue; const issue = wo as ValidationIssue; issue.line = vr.line; @@ -155,7 +170,7 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat if (!isFlaggedOrMinLength(issue)) continue; checkWord(issue); if (!isFlaggedOrNotFound(issue) || !isNotRepeatingChar(issue)) continue; - issue.text = Text.extractText(lineSegment.segment, issue.offset, issue.offset + issue.text.length); + issue.text = extractText(lineSegment.segment, issue.offset, issue.offset + issue.text.length); codeWordResults.push(issue); } @@ -178,7 +193,7 @@ export function lineValidatorFactory(sDict: 
SpellingDictionary, options: Validat } const mismatches: ValidationIssue[] = []; - for (const wo of Text.extractWordsFromTextOffset(possibleWord)) { + for (const wo of extractWordsFromTextOffset(possibleWord)) { if (setOfKnownSuccessfulWords.has(wo.text)) continue; const issue = wo as ValidationIssue; issue.line = lineSegment.line; @@ -200,7 +215,7 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat } const checkedPossibleWords: Iterable = pipe( - Text.extractPossibleWordsFromTextOffset(lineSegment.segment), + extractPossibleWordsFromTextOffset(lineSegment.segment), opFilter(filterAlreadyChecked), opConcatMap(checkPossibleWords), opMap(annotateIssue), diff --git a/packages/cspell-lib/src/lib/util/resolveFile.ts b/packages/cspell-lib/src/lib/util/resolveFile.ts index 57f02675b0f..541b4118fd6 100644 --- a/packages/cspell-lib/src/lib/util/resolveFile.ts +++ b/packages/cspell-lib/src/lib/util/resolveFile.ts @@ -9,8 +9,8 @@ import { importResolveModuleName } from '@cspell/dynamic-import'; import type { VFileSystem } from 'cspell-io'; import resolveFrom from 'resolve-from'; -import { srcDirectory } from '../../lib-cjs/pkg-info.cjs'; import { getFileSystem } from '../fileSystem.js'; +import { srcDirectory } from '../pkg-info.mjs'; import { envToTemplateVars, replaceTemplate } from './templates.js'; import { fileURLOrPathToPath, diff --git a/packages/cspell-lib/src/lib/util/text.test.ts b/packages/cspell-lib/src/lib/util/text.test.ts index 2f21b38c0c3..cb81bebb526 100644 --- a/packages/cspell-lib/src/lib/util/text.test.ts +++ b/packages/cspell-lib/src/lib/util/text.test.ts @@ -9,6 +9,7 @@ import { describe, expect, test } from 'vitest'; import * as Text from './text.js'; import { splitCamelCaseWord } from './text.js'; +import { regExSplitWords, regExSplitWords2, regExUpperSOrIng } from './textRegex.js'; // cSpell:ignore Ápple DBAs ctrip γάμμα @@ -28,6 +29,19 @@ describe('Util Text', () => { 
expect(Text.stringToRegExp(pattern)).toEqual(expected); }); + test.each` + word | expected + ${'hello'} | ${'hello'.split('|')} + ${'helloThere'} | ${['hello', 'There']} + ${'HelloThere'} | ${['Hello', 'There']} + ${'BigÁpple'} | ${['Big', 'Ápple']} + ${'ASCIIToUTF16'} | ${['ASCII', 'To', 'UTF16']} + ${'URLsAndDBAs'} | ${['URLs', 'And', 'DBAs']} + ${'WALKingRUNning'} | ${['WALKing', 'RUNning']} + `('splitCamelCaseWord $word', ({ word, expected }) => { + expect(splitCamelCaseWord(word)).toEqual(expected); + }); + test.each` word | expected ${'hello'} | ${'hello'.split('|')} @@ -38,7 +52,7 @@ describe('Util Text', () => { ${'URLsAndDBAs'} | ${['Urls', 'And', 'Dbas']} ${'WALKingRUNning'} | ${['Walking', 'Running']} `('splitCamelCaseWord $word', ({ word, expected }) => { - expect(splitCamelCaseWord(word)).toEqual(expected); + expect(splitCamelCaseWordOrig(word)).toEqual(expected); }); test('extract word from text', () => { @@ -473,3 +487,14 @@ SQL; Not checked. `; + +/** + * Split camelCase words into an array of strings. + */ +function splitCamelCaseWordOrig(word: string): string[] { + const wPrime = word.replace(regExUpperSOrIng, (s) => s[0] + s.slice(1).toLowerCase()); + const separator = '_<^*_*^>_'; + const pass1 = wPrime.replace(regExSplitWords, '$1' + separator + '$2'); + const pass2 = pass1.replace(regExSplitWords2, '$1' + separator + '$2'); + return pass2.split(separator); +} diff --git a/packages/cspell-lib/src/lib/util/text.ts b/packages/cspell-lib/src/lib/util/text.ts index e5ccff60169..f17190d92d5 100644 --- a/packages/cspell-lib/src/lib/util/text.ts +++ b/packages/cspell-lib/src/lib/util/text.ts @@ -8,9 +8,7 @@ import { regExAllUpper, regExFirstUpper, regExIgnoreCharacters, - regExSplitWords, - regExSplitWords2, - regExUpperSOrIng, + regExpSplitWordBreaks, regExWords, regExWordsAndDigits, } from './textRegex.js'; @@ -35,11 +33,7 @@ export function splitCamelCaseWordWithOffset(wo: TextOffset): Array * Split camelCase words into an array of strings. 
*/ export function splitCamelCaseWord(word: string): string[] { - const wPrime = word.replace(regExUpperSOrIng, (s) => s[0] + s.slice(1).toLowerCase()); - const separator = '_<^*_*^>_'; - const pass1 = wPrime.replace(regExSplitWords, '$1' + separator + '$2'); - const pass2 = pass1.replace(regExSplitWords2, '$1' + separator + '$2'); - return pass2.split(separator); + return word.split(regExpSplitWordBreaks); } /** @@ -55,12 +49,13 @@ export function matchStringToTextOffset(reg: RegExp, text: string): Iterable { - const textOffset = text; - const fnOffsetMap = offsetMap(textOffset.offset); +export function matchToTextOffset(reg: RegExp, t: TextOffset): Iterable { + const text = t.text; + const offset = t.offset; + // return opMap((m: RegExpExecArray) => ({ text: m[0], offset: offset + m.index }))(match(reg, text)); return pipe( - match(reg, textOffset.text), - opMap((m) => fnOffsetMap({ text: m[0], offset: m.index || 0 })), + match(reg, text), + opMap((m) => ({ text: m[0], offset: offset + m.index })), ); } @@ -182,13 +177,6 @@ export function extractText(textOffset: TextOffset, startPos: number, endPos: nu return text.slice(a, b); } -interface OffsetMap { - offset: number; -} -function offsetMap(offset: number) { - return (xo: T) => ({ ...xo, offset: xo.offset + offset }) as T; -} - export function calculateTextDocumentOffsets( uri: string | Uri | URL, doc: string, diff --git a/packages/cspell-lib/src/lib/util/textRegex.ts b/packages/cspell-lib/src/lib/util/textRegex.ts index c0bc3104515..54894bfebf7 100644 --- a/packages/cspell-lib/src/lib/util/textRegex.ts +++ b/packages/cspell-lib/src/lib/util/textRegex.ts @@ -1,8 +1,12 @@ // cspell:ignore ings ning gimuy anrvtbf gimuxy -export const regExUpperSOrIng = /([\p{Lu}\p{M}]+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu; +export const regExUpperSOrIng = /([\p{Lu}\p{M}]+(?:\\?['’])?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu; export const regExSplitWords = /(\p{Ll}\p{M}?)(\p{Lu})/gu; export const regExSplitWords2 = 
/(\p{Lu}\p{M}?)(\p{Lu}\p{M}?\p{Ll})/gu; +export const regExpSplitWordBreaks = + /(?<=\p{Ll}\p{M}?)(?=\p{Lu})|(?<=\p{Lu}\p{M}?)(?=\p{Lu}\p{M}?\p{Ll})(?!\p{Lu}\p{M}?(?:s|ing|ies|es|ings|ed|ning)(?!\p{Ll}))/gu; +export const regExpAllPossibleWordBreaks = + /(?<=\p{Ll}\p{M}?)(?=\p{Lu})|(?<=\p{Lu}\p{M}?)(?=\p{Lu}\p{M}?\p{Ll})|(?<=\p{Lu}\p{M}?\p{Lu}\p{M}?)(?=\p{Ll})|(?<=\p{L}\p{M}?)(?=\P{L})|(?<=\P{L})(?=\p{L})/gu; export const regExWords = /\p{L}\p{M}?(?:(?:\\?['’])?\p{L}\p{M}?)*/gu; // Words can be made of letters, numbers, period, underscore, dash, plus, and single quote export const regExWordsAndDigits = /[\p{L}\w'’`.+-](?:(?:\\(?=[']))?[\p{L}\p{M}\w'’`.+-])*/gu; diff --git a/packages/cspell-lib/src/lib/util/url.test.ts b/packages/cspell-lib/src/lib/util/url.test.ts index 3c64dce1412..d0e2393745d 100644 --- a/packages/cspell-lib/src/lib/util/url.test.ts +++ b/packages/cspell-lib/src/lib/util/url.test.ts @@ -3,7 +3,7 @@ import { fileURLToPath, pathToFileURL } from 'node:url'; import { describe, expect, test } from 'vitest'; -import { srcDirectory } from '../../lib-cjs/pkg-info.cjs'; +import { srcDirectory } from '../pkg-info.mjs'; import { cwdURL, getSourceDirectoryUrl, diff --git a/packages/cspell-lib/src/lib/util/url.ts b/packages/cspell-lib/src/lib/util/url.ts index 3d7d6d1580e..ed6f5378ea7 100644 --- a/packages/cspell-lib/src/lib/util/url.ts +++ b/packages/cspell-lib/src/lib/util/url.ts @@ -3,7 +3,7 @@ import { pathToFileURL } from 'node:url'; import { toFilePathOrHref, toFileURL } from '@cspell/url'; -import { srcDirectory } from '../../lib-cjs/pkg-info.cjs'; +import { srcDirectory } from '../pkg-info.mjs'; export { addTrailingSlash, diff --git a/packages/cspell-lib/src/lib/util/wordSplitter.perf.ts b/packages/cspell-lib/src/lib/util/wordSplitter.perf.ts new file mode 100644 index 00000000000..d637cfa6d4c --- /dev/null +++ b/packages/cspell-lib/src/lib/util/wordSplitter.perf.ts @@ -0,0 +1,148 @@ +import { readFile } from 'node:fs/promises'; +import { extname } 
from 'node:path/posix'; + +import { TextOffset } from '@cspell/cspell-types'; +import { suite } from 'perf-insight'; + +import { + extractPossibleWordsFromTextOffset, + extractWordsFromCode, + extractWordsFromText, + matchStringToTextOffset, + splitCamelCaseWordWithOffset, + textOffset, +} from './text.js'; +import { regExWordsAndDigits } from './textRegex.js'; + +const regExpWord = /\b[\w\p{L}\p{M}]+\b/gu; + +suite('wordSplitter', async (test) => { + const lines = await sampleLines(); + + const iterations = 1; + + test('baseline: matchAll /[\\w\\p{L}\\p{M}]+/gu', () => { + const s: TextOffset[] = []; + const _regExpWord = regExpWord; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + for (const m of line.matchAll(_regExpWord)) { + s.push({ text: m[0], offset: m.index }); + } + } + } + return s; + }); + + test('baseline: matchAll non-space /\\S+/g', () => { + const s: TextOffset[] = []; + const regExp = /\S+/g; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + s.push(...[...line.matchAll(regExp)].map((a) => ({ text: a[0], offset: a.index }))); + } + } + return s; + }); + + test('baseline: matchAll non-special', () => { + const s: TextOffset[] = []; + const regExp = /[^\s();:{}[\]*&^%$#@!~"?/\\,<>+=]+/g; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + s.push(...[...line.matchAll(regExp)].map((a) => ({ text: a[0], offset: a.index }))); + } + } + return s; + }); + + test('baseline: matchAll regExWordsAndDigits', () => { + const s: TextOffset[] = []; + const regExp = regExWordsAndDigits; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + s.push(...[...line.matchAll(regExp)].map((a) => ({ text: a[0], offset: a.index }))); + } + } + return s; + }); + + test('baseline: matchAll regExWordsAndDigits matchStringToTextOffset', () => { + const s: TextOffset[] = []; + const regExp = regExWordsAndDigits; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + 
s.push(...matchStringToTextOffset(regExp, line)); + } + } + return s; + }); + + test('matchAll into possible words', () => { + const s: TextOffset[] = []; + const regExpMaybeWord = /[^\s();:{}[\]*&^%$#@!~"?/\\,<>+=]+/g; + const _regExpWord = regExpWord; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + for (const m of line.matchAll(regExpMaybeWord)) { + const index = m.index; + for (const words of m[0].matchAll(_regExpWord)) { + s.push({ text: words[0], offset: index + words.index }); + } + } + } + } + return s; + }); + + test('extractWordsFromText', () => { + const s: TextOffset[] = []; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + s.push(...extractWordsFromText(line)); + } + } + return s; + }); + + test('extractPossibleWordsFromTextOffset', () => { + const s: TextOffset[] = []; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + s.push(...extractPossibleWordsFromTextOffset(textOffset(line))); + } + } + return s; + }); + + test('extractPossibleWordsFromTextOffset splitCamelCaseWordWithOffset', () => { + const s: TextOffset[] = []; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + for (const wo of extractPossibleWordsFromTextOffset(textOffset(line))) { + for (const cWord of splitCamelCaseWordWithOffset(wo)) { + s.push(cWord); + } + } + } + } + return s; + }); + + test('extractWordsFromCode', () => { + const s: TextOffset[] = []; + for (let i = iterations; i > 0; --i) { + for (const line of lines) { + s.push(...extractWordsFromCode(line)); + } + } + return s; + }); +}); + +async function sampleLines() { + const ext = extname(new URL(import.meta.url).pathname); + const url = new URL('wordSplitter.test' + ext, import.meta.url); + const context = await readFile(url, 'utf8'); + return context.replaceAll('\r\n', '\n').replaceAll('\r', '\n').split('\n'); +} diff --git a/packages/cspell-lib/src/lib/util/wordSplitter.ts b/packages/cspell-lib/src/lib/util/wordSplitter.ts index 
df4345a3dd1..1e8de270ab9 100644 --- a/packages/cspell-lib/src/lib/util/wordSplitter.ts +++ b/packages/cspell-lib/src/lib/util/wordSplitter.ts @@ -13,7 +13,7 @@ import { regExWordsAndDigits, } from './textRegex.js'; -const ignoreBreak: readonly number[] = Object.freeze([] as number[]); +const ignoreBreak: BreakPairs = Object.freeze([]) as unknown as BreakPairs; export type IsValidWordFn = (word: TextOffset) => boolean; @@ -150,7 +150,7 @@ function findNextWordText({ text, offset }: TextOffset): TextOffset { }; } -type BreakPairs = readonly number[]; +type BreakPairs = readonly [number, number]; interface PossibleWordBreak { /** offset from the start of the string */ diff --git a/packages/cspell-lib/src/test-util/tsconfig.json b/packages/cspell-lib/src/test-util/tsconfig.json deleted file mode 100644 index 9dbbc9aa151..00000000000 --- a/packages/cspell-lib/src/test-util/tsconfig.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "extends": "../../../../tsconfig.esm.json", - "compilerOptions": { - "allowJs": false, - "composite": true, - "tsBuildInfoFile": "../../temp/compile.test.tsbuildInfo", - "rootDir": ".", - "outDir": "../../dist/test", - "types": ["node"] - }, - "include": ["."] -} diff --git a/packages/cspell-lib/tsconfig.esm.json b/packages/cspell-lib/tsconfig.esm.json deleted file mode 100644 index aad907e2f9c..00000000000 --- a/packages/cspell-lib/tsconfig.esm.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "extends": "../../tsconfig.esm.json", - "compilerOptions": { - "composite": true, - "tsBuildInfoFile": "temp/compile.esm.tsbuildInfo", - "rootDir": "src/lib", - "outDir": "dist/esm", - "types": ["node"] - }, - "include": ["src/lib", "src/lib-cjs/vscode-uri.cts"], - "references": [{ "path": "./src/lib-cjs/tsconfig.cjs.json" }, { "path": "./src/test-util" }] -} diff --git a/packages/cspell-lib/tsconfig.json b/packages/cspell-lib/tsconfig.json index b8610c4a5da..30891b89672 100644 --- a/packages/cspell-lib/tsconfig.json +++ b/packages/cspell-lib/tsconfig.json @@ -1,4 +1,9 
@@ { - "files": [], - "references": [{ "path": "./tsconfig.esm.json" }, { "path": "./src/lib-cjs" }, { "path": "./src/test-util" }] + "extends": "../../tsconfig.esm.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "types": ["node"] + }, + "include": ["src"] } diff --git a/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv b/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv index 9974cbf344a..4cc79d08dc0 100644 --- a/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv +++ b/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv @@ -1128,7 +1128,6 @@ chapter, true starts, true Chapter, true Tufte, false -Tufte, false SECTION, true Section, true section, true diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index db43dc5f7d8..1733854d38c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -64,10 +64,10 @@ importers: version: 9.8.0 eslint-import-resolver-typescript: specifier: ^3.6.1 - version: 3.6.1(eslint-plugin-import@2.29.1)(eslint@9.8.0) + version: 3.6.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-plugin-import@2.29.1)(eslint@9.8.0) eslint-plugin-jest: specifier: ^28.6.0 - version: 28.6.0(@typescript-eslint/eslint-plugin@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint@9.8.0)(jest@29.7.0(@types/node@18.19.42)(ts-node@10.9.2(@types/node@18.19.42)(typescript@5.5.4)))(typescript@5.5.4) + version: 28.6.0(@typescript-eslint/eslint-plugin@7.18.0(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint@9.8.0)(typescript@5.5.4))(eslint@9.8.0)(jest@29.7.0(@types/node@18.19.42)(ts-node@10.9.2(@types/node@18.19.42)(typescript@5.5.4)))(typescript@5.5.4) eslint-plugin-n: specifier: ^17.10.1 version: 17.10.1(eslint@9.8.0) @@ -749,6 +749,9 @@ importers: lorem-ipsum: specifier: ^2.0.8 version: 2.0.8 + perf-insight: + specifier: ^1.2.0 + version: 1.2.0 packages/cspell-pipe: devDependencies: @@ -14958,13 +14961,13 @@ snapshots: 
transitivePeerDependencies: - supports-color - eslint-import-resolver-typescript@3.6.1(eslint-plugin-import@2.29.1)(eslint@9.8.0): + eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-plugin-import@2.29.1)(eslint@9.8.0): dependencies: debug: 4.3.6(supports-color@8.1.1) enhanced-resolve: 5.17.1 eslint: 9.8.0 - eslint-module-utils: 2.8.1(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(eslint-plugin-import@2.29.1)(eslint@9.8.0))(eslint@9.8.0) - eslint-plugin-import: 2.29.1(eslint-import-resolver-typescript@3.6.1)(eslint@9.8.0) + eslint-module-utils: 2.8.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-plugin-import@2.29.1)(eslint@9.8.0))(eslint@9.8.0) + eslint-plugin-import: 2.29.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-import-resolver-typescript@3.6.1)(eslint@9.8.0) fast-glob: 3.3.2 get-tsconfig: 4.7.6 is-core-module: 2.15.0 @@ -14996,13 +14999,14 @@ snapshots: - bluebird - supports-color - eslint-module-utils@2.8.1(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(eslint-plugin-import@2.29.1)(eslint@9.8.0))(eslint@9.8.0): + eslint-module-utils@2.8.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-plugin-import@2.29.1)(eslint@9.8.0))(eslint@9.8.0): dependencies: debug: 3.2.7 optionalDependencies: + '@typescript-eslint/parser': 7.18.0(eslint@9.8.0)(typescript@5.5.4) eslint: 9.8.0 eslint-import-resolver-node: 0.3.9 - eslint-import-resolver-typescript: 3.6.1(eslint-plugin-import@2.29.1)(eslint@9.8.0) + eslint-import-resolver-typescript: 
3.6.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-plugin-import@2.29.1)(eslint@9.8.0) transitivePeerDependencies: - supports-color @@ -15020,7 +15024,7 @@ snapshots: eslint: 9.8.0 eslint-compat-utils: 0.5.1(eslint@9.8.0) - eslint-plugin-import@2.29.1(eslint-import-resolver-typescript@3.6.1)(eslint@9.8.0): + eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-import-resolver-typescript@3.6.1)(eslint@9.8.0): dependencies: array-includes: 3.1.8 array.prototype.findlastindex: 1.2.5 @@ -15030,7 +15034,7 @@ snapshots: doctrine: 2.1.0 eslint: 9.8.0 eslint-import-resolver-node: 0.3.9 - eslint-module-utils: 2.8.1(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(eslint-plugin-import@2.29.1)(eslint@9.8.0))(eslint@9.8.0) + eslint-module-utils: 2.8.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint-plugin-import@2.29.1)(eslint@9.8.0))(eslint@9.8.0) hasown: 2.0.2 is-core-module: 2.15.0 is-glob: 4.0.3 @@ -15040,12 +15044,14 @@ snapshots: object.values: 1.2.0 semver: 6.3.1 tsconfig-paths: 3.15.0 + optionalDependencies: + '@typescript-eslint/parser': 7.18.0(eslint@9.8.0)(typescript@5.5.4) transitivePeerDependencies: - eslint-import-resolver-typescript - eslint-import-resolver-webpack - supports-color - eslint-plugin-jest@28.6.0(@typescript-eslint/eslint-plugin@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint@9.8.0)(jest@29.7.0(@types/node@18.19.42)(ts-node@10.9.2(@types/node@18.19.42)(typescript@5.5.4)))(typescript@5.5.4): + eslint-plugin-jest@28.6.0(@typescript-eslint/eslint-plugin@7.18.0(@typescript-eslint/parser@7.18.0(eslint@9.8.0)(typescript@5.5.4))(eslint@9.8.0)(typescript@5.5.4))(eslint@9.8.0)(jest@29.7.0(@types/node@18.19.42)(ts-node@10.9.2(@types/node@18.19.42)(typescript@5.5.4)))(typescript@5.5.4): 
dependencies: '@typescript-eslint/utils': 7.18.0(eslint@9.8.0)(typescript@5.5.4) eslint: 9.8.0