Skip to content

Commit

Permalink
feat: introduce matchSource match template variable
Browse files Browse the repository at this point in the history
- Change default fuzzy match pane template with matchSource
- Add method MatchesVarExpansion#expandMatchSource
- Extend NearString.MATCH_SOURCE to have TM_SUBSEG
- Update test expectations of MatchesTextAreaTest, and FindMatchesTest
- Add human-readable names of MATCH_SOURCE in Bundle.properties

Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr committed Dec 17, 2024
1 parent ecb65e3 commit 821d6a9
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 21 deletions.
5 changes: 5 additions & 0 deletions src/org/omegat/Bundle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2947,3 +2947,8 @@ DICTIONARY_LOAD_FILE=Loaded dictionary from '{0}': {1} ms
DICTIONARY_LOAD_ERROR=Error load dictionary from '{0}': {1}
DICTIONARY_MANAGER_ERROR_SAVE_IGNORE=Error saving ignore words"
EDITOR_CONTROLLER_EXCEPTION=bad location exception when changing case

MATCHES_COMES_FROM_TM=From TM
MATCHES_COMES_FROM_FILES=Files
MATCHES_COMES_FROM_MEMORY=From Project
MATCHES_COMES_FROM_TM_SUBSEG=Sub-segmented match
7 changes: 5 additions & 2 deletions src/org/omegat/core/matching/NearString.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,11 @@
*/
public class NearString {
public enum MATCH_SOURCE {
MEMORY, TM, FILES
};
MEMORY,
TM,
FILES,
TM_SUBSEG;
}

public enum SORT_KEY {
SCORE, SCORE_NO_STEM, ADJUSTED_SCORE
Expand Down
61 changes: 46 additions & 15 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,19 @@

/**
* Class to find matches by specified criteria.
*
* <p>
* Since we can use stemmers to prepare tokens, we should use 3-pass comparison
* of similarity. Similarity will be calculated in 3 steps:
*
* 1. Split original segment into word-only tokens using stemmer (with stop
* words list), then compare tokens.
*
* 2. Split original segment into word-only tokens without stemmer, then compare
* tokens.
*
* 3. Split original segment into not-only-words tokens (including numbers and
* tags) without stemmer, then compare tokens.
*
* This class is not thread safe ! Must be used in the one thread only.
* <ol>
* <li>Split the original segment into word-only tokens using stemmer (with stop
* words list), then compare tokens.</li>
* <li>Split the original segment into word-only tokens without a stemmer,
* then compare tokens.</li>
* <li>Split the original segment into not-only-words tokens (including numbers
* and tags) without a stemmer, then compare tokens.</li>
* </ol>
* <p>
* This class is not thread-safe! It must be used from one thread only.
*
* @author Maxym Mykhalchuk
* @author Alex Buloichik ([email protected])
Expand Down Expand Up @@ -150,6 +149,23 @@ public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentM
OConsts.FUZZY_MATCH_THRESHOLD));
}

/**
* Creates a FindMatches instance, which finds fuzzy matches in translation memories.
*
* @param project
* OmegaT project.
* @param segmenter
* used when running a segmentation search.
* @param maxCount
* limit the maximum count of the results.
* @param searchExactlyTheSame
* allows searching similarities with the same text as a source
* segment. This mode is used only for separate sentence match
* in a paragraph project, i.e., where a source is just part of
* the current source.
* @param threshold
* threshold to use.
*/
public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame, boolean applyThreshold, int threshold) {
this.project = project;
Expand All @@ -165,6 +181,20 @@ public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean
this.applyThreshold = applyThreshold;
}

/**
* Search Translation memories.
*
* @param searchText
* target segment or term to search.
* @param fillSimilarityData
* fill similarity data into the result of NearString objects.
* @param stop
* IStopped callback object to indicate cancel operation.
* @return
* List of NearString objects, which hold matched translation entry.
* @throws StoppedException
* raised when stopped during a search process.
*/
public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
IStopped stop) throws StoppedException {
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
Expand Down Expand Up @@ -235,8 +265,8 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
for (ITMXEntry tmen : en.getValue().getEntries()) {
checkStopped(stop);
if (tmen.getSourceText() == null) {
// Not all TMX entries have a source; in that case there can
// be no meaningful match, so skip.
// Not all TMX entries have a source; skip such entries
// because no meaningful match is possible.
continue;
}
if (requiresTranslation && tmen.getTranslationText() == null) {
Expand All @@ -249,6 +279,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty);
}
}

// travel by all entries for check source file translations
for (SourceTextEntry ste : project.getAllEntries()) {
checkStopped(stop);
Expand Down Expand Up @@ -385,7 +416,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
}

// BUGS#1236 - stat display does not use threshold config check
if (applyThreshold && similarityStem < fuzzyMatchThreshold
if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold
&& similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) {
return;
}
Expand Down
25 changes: 23 additions & 2 deletions src/org/omegat/gui/matches/MatchesVarExpansion.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,19 +90,21 @@ public class MatchesVarExpansion extends VarExpansion<NearString> {
public static final String VAR_DIFF_REVERSED = "${diffReversed}";
public static final String VAR_SOURCE_LANGUAGE = "${sourceLanguage}";
public static final String VAR_TARGET_LANGUAGE = "${targetLanguage}";
public static final String VAR_MATCH_SOURCE = "${matchSource}";

private static final String[] MATCHES_VARIABLES = { VAR_ID, VAR_SOURCE_TEXT, VAR_DIFF, VAR_DIFF_REVERSED,
VAR_TARGET_TEXT, VAR_SCORE_BASE, VAR_SCORE_NOSTEM, VAR_SCORE_ADJUSTED, VAR_FILE_NAME_ONLY,
VAR_FILE_PATH, VAR_FILE_SHORT_PATH, VAR_INITIAL_CREATION_ID, VAR_INITIAL_CREATION_DATE,
VAR_CHANGED_ID, VAR_CHANGED_DATE, VAR_FUZZY_FLAG, VAR_SOURCE_LANGUAGE, VAR_TARGET_LANGUAGE };
VAR_CHANGED_ID, VAR_CHANGED_DATE, VAR_FUZZY_FLAG, VAR_SOURCE_LANGUAGE, VAR_TARGET_LANGUAGE,
VAR_MATCH_SOURCE };

/**
 * Returns the template variables supported by the matches pane.
 *
 * @return an unmodifiable list view over {@code MATCHES_VARIABLES}.
 */
public static List<String> getMatchesVariables() {
    List<String> variables = Arrays.asList(MATCHES_VARIABLES);
    return Collections.unmodifiableList(variables);
}

public static final String DEFAULT_TEMPLATE = VAR_ID + ". " + VAR_FUZZY_FLAG + VAR_SOURCE_TEXT + "\n"
+ VAR_TARGET_TEXT + "\n" + "<" + VAR_SCORE_BASE + "/" + VAR_SCORE_NOSTEM + "/"
+ VAR_SCORE_ADJUSTED + "% " + VAR_FILE_PATH + ">";
+ VAR_SCORE_ADJUSTED + "%" + VAR_MATCH_SOURCE + VAR_FILE_PATH + ">";

public static final Pattern PATTERN_SINGLE_PROPERTY = Pattern.compile("@\\{(.+?)\\}");
public static final Pattern PATTERN_PROPERTY_GROUP = Pattern
Expand Down Expand Up @@ -222,6 +224,22 @@ private String getPropValue(List<TMXProp> props, String type) {
return null;
}

/**
 * Replaces every occurrence of {@link #VAR_MATCH_SOURCE} in the template
 * with the localized, human-readable name of the origin of the match.
 *
 * @param localTemplate
 *            template text in which the variable is expanded.
 * @param comesFrom
 *            origin of the match; may be {@code null}, in which case the
 *            variable is replaced with an empty string.
 * @return the template with the variable expanded.
 */
private String expandMatchSource(String localTemplate, NearString.MATCH_SOURCE comesFrom) {
    String label;
    if (comesFrom == null) {
        // A switch on a null enum value throws NullPointerException before
        // the default branch is reached, so handle the unknown origin here.
        label = "";
    } else {
        // NOTE(review): only the TM and TM_SUBSEG labels get a trailing
        // space - presumably because a file path follows them in the
        // default template; confirm before unifying.
        switch (comesFrom) {
        case TM:
            label = OStrings.getString("MATCHES_COMES_FROM_TM") + " ";
            break;
        case FILES:
            label = OStrings.getString("MATCHES_COMES_FROM_FILES");
            break;
        case MEMORY:
            label = OStrings.getString("MATCHES_COMES_FROM_MEMORY");
            break;
        case TM_SUBSEG:
            label = OStrings.getString("MATCHES_COMES_FROM_TM_SUBSEG") + " ";
            break;
        default:
            label = "";
            break;
        }
    }
    return localTemplate.replace(VAR_MATCH_SOURCE, label);
}

@Override
public String expandVariables(NearString match) {
// do not modify template directly, so that we can reuse for another
Expand Down Expand Up @@ -285,6 +303,9 @@ public String expandVariables(NearString match) {
} else {
localTemplate = localTemplate.replace(VAR_TARGET_TEXT, match.translation);
}

localTemplate = expandMatchSource(localTemplate, match.comesFrom);

return localTemplate;
}

Expand Down
2 changes: 1 addition & 1 deletion test/src/org/omegat/core/statistics/FindMatchesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2021 Hiroshi Miura
Copyright (C) 2021-2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand Down
Empty file.
3 changes: 2 additions & 1 deletion test/src/org/omegat/gui/matches/MatchesVarExpansionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import org.omegat.core.data.ProjectProperties;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.matching.NearString;
import org.omegat.core.matching.NearString.MATCH_SOURCE;
import org.omegat.gui.editor.IEditor;
import org.omegat.gui.editor.IEditorFilter;
import org.omegat.gui.editor.IEditorSettings;
Expand Down Expand Up @@ -208,7 +209,7 @@ public NearString getMockNearString() {
entry.changeDate = 20020523;
entry.otherProperties = testProps;
NearString.Scores scores = new NearString.Scores(20, 40, 60);
return new NearString(null, entry, null, false, scores, null, "mock testing project");
return new NearString(null, entry, MATCH_SOURCE.TM, false, scores, null, "mock testing project");
};

private void setupProject(Language sourceLanguage, Language targetLanguage) {
Expand Down

0 comments on commit 821d6a9

Please sign in to comment.