-
-
Notifications
You must be signed in to change notification settings - Fork 113
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: introduce matchSource match template variable
- Change default fuzzy match pane template with matchSource - Add method MatchesVarExpansion#expandMatchSource - Extend NearString.MATCH_SOURCE to have TM_SUBSEG - Update test expectations of MatchesTextAreaTest, and FindMatchesTest - Add human-readable names of MATCH_SOURCE in Bundle.properties Signed-off-by: Hiroshi Miura <[email protected]>
- Loading branch information
Showing
7 changed files
with
82 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,20 +63,19 @@ | |
|
||
/** | ||
* Class to find matches by specified criteria. | ||
* | ||
* <p> | ||
* Since we can use stemmers to prepare tokens, we should use 3-pass comparison | ||
* of similarity. Similarity will be calculated in 3 steps: | ||
* | ||
* 1. Split original segment into word-only tokens using stemmer (with stop | ||
* words list), then compare tokens. | ||
* | ||
* 2. Split original segment into word-only tokens without stemmer, then compare | ||
* tokens. | ||
* | ||
* 3. Split original segment into not-only-words tokens (including numbers and | ||
* tags) without stemmer, then compare tokens. | ||
* | ||
* This class is not thread safe ! Must be used in the one thread only. | ||
* <ol> | ||
* <li>Split the original segment into word-only tokens using a stemmer (with a | ||
* stop words list), then compare tokens.</li> | ||
* <li>Split the original segment into word-only tokens without a stemmer, | ||
* then compare tokens.</li> | ||
* <li>Split the original segment into not-only-words tokens (including numbers | ||
* and tags) without a stemmer, then compare tokens.</li> | ||
* </ol> | ||
* <p> | ||
* This class is not thread-safe! It must be used from a single thread only. | ||
* | ||
* @author Maxym Mykhalchuk | ||
* @author Alex Buloichik ([email protected]) | ||
|
@@ -150,6 +149,23 @@ public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentM | |
OConsts.FUZZY_MATCH_THRESHOLD)); | ||
} | ||
|
||
/** | ||
* Creates a matcher that finds fuzzy matches in translation memories. | ||
* | ||
* @param project | ||
* OmegaT project. | ||
* @param segmenter | ||
* used when running a segmentation search. | ||
* @param maxCount | ||
* limit the maximum count of the results. | ||
* @param searchExactlyTheSame | ||
* allows searching similarities with the same text as a source | ||
* segment. This mode is used only for separate sentence match | ||
* in a paragraph project, i.e., where a source is just part of | ||
* the current source. | ||
* @param threshold | ||
* threshold to use. | ||
*/ | ||
public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch, | ||
boolean searchExactlyTheSame, boolean applyThreshold, int threshold) { | ||
this.project = project; | ||
|
@@ -165,6 +181,20 @@ public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean | |
this.applyThreshold = applyThreshold; | ||
} | ||
|
||
/** | ||
* Searches the translation memories. | ||
* | ||
* @param searchText | ||
* target segment or term to search. | ||
* @param fillSimilarityData | ||
* fill similarity data into the result of NearString objects. | ||
* @param stop | ||
* IStopped callback object to indicate cancel operation. | ||
* @return | ||
* List of NearString objects, which hold the matched translation entries. | ||
* @throws StoppedException | ||
* raised when stopped during a search process. | ||
*/ | ||
public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData, | ||
IStopped stop) throws StoppedException { | ||
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1); | ||
|
@@ -235,8 +265,8 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
for (ITMXEntry tmen : en.getValue().getEntries()) { | ||
checkStopped(stop); | ||
if (tmen.getSourceText() == null) { | ||
// Not all TMX entries have a source; in that case there can | ||
// be no meaningful match, so skip. | ||
// Not all TMX entries have a source; skip such entries | ||
// because no meaningful match is possible. | ||
continue; | ||
} | ||
if (requiresTranslation && tmen.getTranslationText() == null) { | ||
|
@@ -249,6 +279,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b | |
processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty); | ||
} | ||
} | ||
|
||
// traverse all entries to check source file translations | ||
for (SourceTextEntry ste : project.getAllEntries()) { | ||
checkStopped(stop); | ||
|
@@ -385,7 +416,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName, | |
} | ||
|
||
// BUGS#1236 - stat display does not use threshold config check | ||
if (applyThreshold && similarityStem < fuzzyMatchThreshold | ||
if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold | ||
&& similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) { | ||
return; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters