From 9a03a8b94163e9059d8f7b7289917bebc8432696 Mon Sep 17 00:00:00 2001 From: George Rhoten Date: Mon, 6 Jan 2025 14:24:36 -0800 Subject: [PATCH] ICU-22979 Support inverse rule for [] span in RBNF --- icu4c/source/i18n/nfrs.cpp | 8 +- icu4c/source/i18n/nfrule.cpp | 14 +- icu4c/source/i18n/rbnf.cpp | 27 +- icu4c/source/i18n/unicode/rbnf.h | 260 +++++++------- icu4c/source/test/intltest/itrbnf.cpp | 89 ++++- icu4c/source/test/intltest/itrbnf.h | 257 +++++++------- .../com/ibm/icu/dev/test/format/RbnfTest.java | 85 +++++ .../main/java/com/ibm/icu/text/NFRule.java | 19 +- .../main/java/com/ibm/icu/text/NFRuleSet.java | 12 +- .../ibm/icu/text/RuleBasedNumberFormat.java | 328 ++++++++---------- .../resources/com/ibm/icu/text/package.html | 2 +- 11 files changed, 629 insertions(+), 472 deletions(-) diff --git a/icu4c/source/i18n/nfrs.cpp b/icu4c/source/i18n/nfrs.cpp index be2ab2932e7a..b7ffb561461b 100644 --- a/icu4c/source/i18n/nfrs.cpp +++ b/icu4c/source/i18n/nfrs.cpp @@ -152,7 +152,7 @@ NFRuleSet::NFRuleSet(RuleBasedNumberFormat *_owner, UnicodeString* descriptions, UnicodeString& description = descriptions[index]; // !!! 
make sure index is valid - if (description.length() == 0) { + if (description.isEmpty()) { // throw new IllegalArgumentException("Empty rule set description"); status = U_PARSE_ERROR; return; @@ -177,16 +177,16 @@ NFRuleSet::NFRuleSet(RuleBasedNumberFormat *_owner, UnicodeString* descriptions, name.setTo(UNICODE_STRING_SIMPLE("%default")); } - if (description.length() == 0) { + if (description.isEmpty()) { // throw new IllegalArgumentException("Empty rule set description"); status = U_PARSE_ERROR; } fIsPublic = name.indexOf(gPercentPercent, 2, 0) != 0; - if ( name.endsWith(gNoparse,8) ) { + if (name.endsWith(gNoparse, 8)) { fIsParseable = false; - name.truncate(name.length()-8); // remove the @noparse from the name + name.truncate(name.length() - 8); // remove the @noparse from the name } // all of the other members of NFRuleSet are initialized diff --git a/icu4c/source/i18n/nfrule.cpp b/icu4c/source/i18n/nfrule.cpp index a2400a6421bd..473873ce8764 100644 --- a/icu4c/source/i18n/nfrule.cpp +++ b/icu4c/source/i18n/nfrule.cpp @@ -64,6 +64,7 @@ NFRule::~NFRule() static const char16_t gLeftBracket = 0x005b; static const char16_t gRightBracket = 0x005d; +static const char16_t gVerticalLine = 0x007C; static const char16_t gColon = 0x003a; static const char16_t gZero = 0x0030; static const char16_t gNine = 0x0039; @@ -146,6 +147,7 @@ NFRule::makeRules(UnicodeString& description, // then it's really shorthand for two rules (with one exception) LocalPointer rule2; UnicodeString sbuf; + int32_t orElseOp = description.indexOf(gVerticalLine); // we'll actually only split the rule into two rules if its // base value is an even multiple of its divisor (or it's one @@ -196,6 +198,9 @@ NFRule::makeRules(UnicodeString& description, // rule2's rule text omits the stuff in brackets: initialize // its rule text and substitutions accordingly sbuf.append(description, 0, brack1); + if (orElseOp >= 0) { + sbuf.append(description, orElseOp + 1, brack2 - orElseOp - 1); + } if (brack2 + 1 < 
description.length()) { sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); } @@ -206,7 +211,12 @@ NFRule::makeRules(UnicodeString& description, // the brackets themselves: initialize _its_ rule text and // substitutions accordingly sbuf.setTo(description, 0, brack1); - sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); + if (orElseOp >= 0) { + sbuf.append(description, brack1 + 1, orElseOp - brack1 - 1); + } + else { + sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); + } if (brack2 + 1 < description.length()) { sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); } @@ -404,7 +414,7 @@ NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) // finally, if the rule body begins with an apostrophe, strip it off // (this is generally used to put whitespace at the beginning of // a rule's rule text) - if (description.length() > 0 && description.charAt(0) == gTick) { + if (!description.isEmpty() && description.charAt(0) == gTick) { description.removeBetween(0, 1); } diff --git a/icu4c/source/i18n/rbnf.cpp b/icu4c/source/i18n/rbnf.cpp index c4e8ff73a7cc..5b6b5e2c1892 100644 --- a/icu4c/source/i18n/rbnf.cpp +++ b/icu4c/source/i18n/rbnf.cpp @@ -1568,12 +1568,12 @@ RuleBasedNumberFormat::init(const UnicodeString& rules, LocalizationInfo* locali // divide up the descriptions into individual rule-set descriptions // and store them in a temporary array. At each step, we also - // new up a rule set, but all this does is initialize its name + // create a rule set, but all this does is initialize its name // and remove it from its description. 
We can't actually parse // the rest of the descriptions and finish initializing everything // because we have to know the names and locations of all the rule // sets before we can actually set everything up - if(!numRuleSets) { + if (!numRuleSets) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } @@ -1616,9 +1616,9 @@ RuleBasedNumberFormat::init(const UnicodeString& rules, LocalizationInfo* locali // last public rule set, no matter what the localization data says. initDefaultRuleSet(); - // finally, we can go back through the temporary descriptions - // list and finish setting up the substructure (and we throw - // away the temporary descriptions as we go) + // Now that we know all the rule names, we can go back through + // the temporary descriptions list and finish setting up the substructure + // (and we throw away the temporary descriptions as we go) { for (int i = 0; i < numRuleSets; i++) { fRuleSets[i]->parseRules(ruleSetDescriptions[i], status); @@ -1706,10 +1706,13 @@ RuleBasedNumberFormat::stripWhitespace(UnicodeString& description) UnicodeString result; int start = 0; - while (start != -1 && start < description.length()) { - // seek to the first non-whitespace character... + UChar ch; + while (start < description.length()) { + // Seek to the first non-whitespace character... 
+ // If the first non-whitespace character is semicolon, skip it and continue while (start < description.length() - && PatternProps::isWhiteSpace(description.charAt(start))) { + && (PatternProps::isWhiteSpace(ch = description.charAt(start)) || ch == gSemiColon)) + { ++start; } @@ -1720,20 +1723,16 @@ RuleBasedNumberFormat::stripWhitespace(UnicodeString& description) // or if we don't find a semicolon, just copy the rest of // the string into the result result.append(description, start, description.length() - start); - start = -1; + break; } else if (p < description.length()) { result.append(description, start, p + 1 - start); start = p + 1; } - - // when we get here, we've seeked off the end of the string, and + // when we get here from the else, we've seeked off the end of the string, and // we terminate the loop (we continue until *start* is -1 rather // than until *p* is -1, because otherwise we'd miss the last // rule in the description) - else { - start = -1; - } } description.setTo(result); diff --git a/icu4c/source/i18n/unicode/rbnf.h b/icu4c/source/i18n/unicode/rbnf.h index f42d91d776fe..bc5e2fef33da 100644 --- a/icu4c/source/i18n/unicode/rbnf.h +++ b/icu4c/source/i18n/unicode/rbnf.h @@ -94,17 +94,17 @@ enum URBNFRuleSetTag { * cents soixante-seize" or * "fünfundzwanzigtausenddreihundertsechsundsiebzig"), but can also be used for * other complicated formatting tasks, such as formatting a number of seconds as hours, - * minutes and seconds (e.g., 3,730 as "1:02:10"). + * minutes and seconds (e.g., 3,730 as "1:02:10").

* *

The resources contain three predefined formatters for each locale: spellout, which * spells out a value in words (123 is "one hundred twenty-three"); ordinal, which * appends an ordinal suffix to the end of a numeral (123 is "123rd"); and * duration, which shows a duration in seconds as hours, minutes, and seconds (123 is - * "2:03").  The client can also define more specialized RuleBasedNumberFormats + * "2:03").  The client can also define more specialized RuleBasedNumberFormats * by supplying programmer-defined rule sets.

* - *

The behavior of a RuleBasedNumberFormat is specified by a textual description - * that is either passed to the constructor as a String or loaded from a resource + *

The behavior of a RuleBasedNumberFormat is specified by a textual description + * that is either passed to the constructor as a String or loaded from a resource * bundle. In its simplest form, the description consists of a semicolon-delimited list of rules. * Each rule has a string of output text and a value or range of values it is applicable to. * In a typical spellout rule set, the first twenty rules are the words for the numbers from @@ -116,7 +116,8 @@ enum URBNFRuleSetTag { *

For larger numbers, we can use the preceding set of rules to format the ones place, and * we only have to supply the words for the multiples of 10:

* - *
 20: twenty[->>];
+ * 
+ * 20: twenty[->>];
  * 30: thirty[->>];
  * 40: forty[->>];
  * 50: fifty[->>];
@@ -137,7 +138,8 @@ enum URBNFRuleSetTag {
  * 

For even larger numbers, we can actually look up several parts of the number in the * list:

* - *
100: << hundred[ >>];
+ *
+ * 100: << hundred[ >>];
* *

The "<<" represents a new kind of substitution. The << isolates * the hundreds digit (and any digits to its left), formats it using this same rule set, and @@ -155,13 +157,15 @@ enum URBNFRuleSetTag { * *

This rule covers values up to 999, at which point we add another rule:

* - *
1000: << thousand[ >>];
+ *
+ * 1000: << thousand[ >>];
* *

Again, the meanings of the brackets and substitution tokens shift because the rule's * base value is a higher power of 10, changing the rule's divisor. This rule can actually be * used all the way up to 999,999. This allows us to finish out the rules as follows:

* - *
 1,000,000: << million[ >>];
+ * 
+ * 1,000,000: << million[ >>];
  * 1,000,000,000: << billion[ >>];
  * 1,000,000,000,000: << trillion[ >>];
  * 1,000,000,000,000,000: OUT OF RANGE!;
@@ -177,30 +181,30 @@ enum URBNFRuleSetTag { *

To see how these rules actually work in practice, consider the following example: * Formatting 25,340 with this rule set would work like this:

* - * + *
* - * - * + * + * * * - * - * + * + * * * - * - * + * + * * * - * - * + * + * * * - * - * + * + * * * - * - * + * * *
<< thousand >>[the rule whose base value is 1,000 is applicable to 25,340]<< thousand >>[the rule whose base value is 1,000 is applicable to 25,340]
twenty->> thousand >>[25,340 over 1,000 is 25. The rule for 20 applies.]twenty->> thousand >>[25,340 over 1,000 is 25. The rule for 20 applies.]
twenty-five thousand >>[25 mod 10 is 5. The rule for 5 is "five."]twenty-five thousand >>[25 mod 10 is 5. The rule for 5 is "five."]
twenty-five thousand << hundred >>[25,340 mod 1,000 is 340. The rule for 100 applies.]twenty-five thousand << hundred >>[25,340 mod 1,000 is 340. The rule for 100 applies.]
twenty-five thousand three hundred >>[340 over 100 is 3. The rule for 3 is "three."]twenty-five thousand three hundred >>[340 over 100 is 3. The rule for 3 is "three."]
twenty-five thousand three hundred forty[340 mod 100 is 40. The rule for 40 applies. Since 40 divides + * twenty-five thousand three hundred forty[340 mod 100 is 40. The rule for 40 applies. Since 40 divides * evenly by 10, the hyphen and substitution in the brackets are omitted.]
@@ -237,20 +241,20 @@ enum URBNFRuleSetTag { * *
* - *

The description of a RuleBasedNumberFormat's behavior consists of one or more rule + *

The description of a RuleBasedNumberFormat's behavior consists of one or more rule * sets. Each rule set consists of a name, a colon, and a list of rules. A rule * set name must begin with a % sign. Rule sets with names that begin with a single % sign * are public: the caller can specify that they be used to format and parse numbers. * Rule sets with names that begin with %% are private: they exist only for the use * of other rule sets. If a formatter only has one rule set, the name may be omitted.

* - *

The user can also specify a special "rule set" named %%lenient-parse. - * The body of %%lenient-parse isn't a set of number-formatting rules, but a RuleBasedCollator + *

The user can also specify a special "rule set" named %%lenient-parse. + * The body of %%lenient-parse isn't a set of number-formatting rules, but a RuleBasedCollator * description which is used to define equivalences for lenient parsing. For more information - * on the syntax, see RuleBasedCollator. For more information on lenient parsing, - * see setLenientParse(). Note: symbols that have syntactic meaning + * on the syntax, see RuleBasedCollator. For more information on lenient parsing, + * see setLenientParse(). Note: symbols that have syntactic meaning * in collation rules, such as '&', have no particular meaning when appearing outside - * of the lenient-parse rule set.

+ * of the lenient-parse rule set.

* *

The body of a rule set consists of an ordered, semicolon-delimited list of rules. * Internally, every rule has a base value, a divisor, rule text, and zero, one, or two substitutions. @@ -260,42 +264,46 @@ enum URBNFRuleSetTag { *

A rule descriptor can take one of the following forms (text in italics is the * name of a token):

* - * + *
* - * - * + * + * + * + * + * * - * - * - * + * + * * - * - * - * + * + * * - * - * - * + * + * * - * - * - * + * + * + * * - * - * - * + * + * * - * - * - * + * + * + * the punctuation of either the full stop or comma * - * - * - * + * + * + * the punctuation of either the full stop or comma * - * - * - * + * + * + * * - * - * - * + * + * + * * - * - * - * + * + * @@ -352,8 +360,8 @@ enum URBNFRuleSetTag { * algorithms: If the rule set is a regular rule set, do the following: * *
    - *
  • If the rule set includes a default rule (and the number was passed in as a double), - * use the default rule.  (If the number being formatted was passed in as a long, + *
  • If the rule set includes a default rule (and the number was passed in as a double), + * use the default rule.  (If the number being formatted was passed in as a long, * the default rule is ignored.)
  • *
  • If the number is negative, use the negative-number rule.
  • *
  • If the number has a fractional part and is greater than 1, use the improper fraction @@ -400,42 +408,43 @@ enum URBNFRuleSetTag { * *

    The meanings of the substitution token characters are as follows:

    * - *
bv:bv specifies the rule's base value. bv is a decimal + * DescriptorDescription
bv:bv specifies the rule's base value. bv is a decimal * number expressed using ASCII digits. bv may contain spaces, period, and commas, * which are ignored. The rule's divisor is the highest power of 10 less than or equal to * the base value.
bv/rad:bv specifies the rule's base value. The rule's divisor is the + *
bv/rad:bv specifies the rule's base value. The rule's divisor is the * highest power of rad less than or equal to the base value.
bv>:bv specifies the rule's base value. To calculate the divisor, + *
bv>:bv specifies the rule's base value. To calculate the divisor, * let the radix be 10, and the exponent be the highest exponent of the radix that yields a * result less than or equal to the base value. Every > character after the base value * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix * raised to the power of the exponent; otherwise, the divisor is 1.
bv/rad>:bv specifies the rule's base value. To calculate the divisor, + *
bv/rad>:bv specifies the rule's base value. To calculate the divisor, * let the radix be rad, and the exponent be the highest exponent of the radix that * yields a result less than or equal to the base value. Every > character after the radix * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix * raised to the power of the exponent; otherwise, the divisor is 1.
-x:The rule is a negative-number rule.
-x:The rule is a negative-number rule.
x.x:The rule is an improper fraction rule. If the full stop in + *
x.x:The rule is an improper fraction rule. If the full stop in * the middle of the rule name is replaced with the decimal point * that is used in the language or DecimalFormatSymbols, then that rule will * have precedence when formatting and parsing this rule. For example, some @@ -304,39 +312,39 @@ enum URBNFRuleSetTag { * handle the decimal point that matches the language's natural spelling of * the punctuation of either the full stop or comma.
0.x:The rule is a proper fraction rule. If the full stop in + *
0.x:The rule is a proper fraction rule. If the full stop in * the middle of the rule name is replaced with the decimal point * that is used in the language or DecimalFormatSymbols, then that rule will * have precedence when formatting and parsing this rule. For example, some * languages use the comma, and can thus be written as 0,x instead. For example, * you can use "0.x: point >>;0,x: comma >>;" to * handle the decimal point that matches the language's natural spelling of - * the punctuation of either the full stop or comma.
x.0:The rule is a default rule. If the full stop in + *
x.0:The rule is a default rule. If the full stop in * the middle of the rule name is replaced with the decimal point * that is used in the language or DecimalFormatSymbols, then that rule will * have precedence when formatting and parsing this rule. For example, some * languages use the comma, and can thus be written as x,0 instead. For example, * you can use "x.0: << point;x,0: << comma;" to * handle the decimal point that matches the language's natural spelling of - * the punctuation of either the full stop or comma.
Inf:The rule for infinity.
Inf:The rule for infinity.
NaN:The rule for an IEEE 754 NaN (not a number).
NaN:The rule for an IEEE 754 NaN (not a number).
nothingIf the rule's rule descriptor is left out, the base value is one plus the + *
nothingIf the rule's rule descriptor is left out, the base value is one plus the * preceding rule's base value (or zero if this is the first rule in the list) in a normal * rule set.  In a fraction rule set, the base value is the same as the preceding rule's * base value.
+ *
* - * - * + * + * + * + * + * + * + * * * * - * - * + * * * * - * - * + * * * * - * - * + * * * - * - * - * + * + * + * * * * - * - * + * * * - * - * - * + * + * + * * * * - * - * + * * * * - * - * + * * * * - * - * + * * * - * - * - * + * + * + * * * - * - * - * - * + * + * + * + * * * - * - * + * * * * - * - * - * + * + * * * - * - * - * + * + * * * - * - * + * * * * - * - * - * + * + * * - * - * - * - * + * + * + * * * - * - * - * - * + * + * + * * * *
>>in normal ruleSyntaxUsageDescription
>>in normal ruleDivide the number by the rule's divisor and format the remainder
in negative-number rulein negative-number ruleFind the absolute value of the number and format the result
in fraction or default rulein fraction or default ruleIsolate the number's fractional part and format it.
in rule in fraction rule setin rule in fraction rule setNot allowed.
>>>in normal rule
>>>in normal ruleDivide the number by the rule's divisor and format the remainder, * but bypass the normal rule-selection process and just use the * rule that precedes this one in this rule list.
in all other rulesin all other rulesNot allowed.
<<in normal rule
<<in normal ruleDivide the number by the rule's divisor, perform floor() on the quotient, * and format the resulting value.
* If there is a DecimalFormat pattern between the < characters and the @@ -448,73 +457,67 @@ enum URBNFRuleSetTag { *
in negative-number rulein negative-number ruleNot allowed.
in fraction or default rulein fraction or default ruleIsolate the number's integral part and format it.
in rule in fraction rule setin rule in fraction rule setMultiply the number by the rule's base value and format the result.
==in all rule sets
==in all rule setsFormat the number unchanged
[]in normal ruleOmit the optional text if the number is an even multiple of the rule's divisor
[]in normal ruleOmit the optional text and rules if the number is an even multiple of the rule's divisor. + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
in negative-number rulein negative-number ruleNot allowed.
in improper-fraction ruleOmit the optional text if the number is between 0 and 1 (same as specifying both an - * x.x rule and a 0.x rule)in improper-fraction ruleOmit the optional text and rules if the number is between 0 and 1 (same as specifying both an + * x.x rule and a 0.x rule). + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
in default ruleOmit the optional text if the number is an integer (same as specifying both an x.x - * rule and an x.0 rule)in default ruleOmit the optional text and rules if the number is an integer (same as specifying both an x.x + * rule and an x.0 rule). + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
in proper-fraction rulein proper-fraction ruleNot allowed.
in rule in fraction rule setOmit the optional text if multiplying the number by the rule's base value yields 1.in rule in fraction rule setOmit the optional text and rules if multiplying the number by the rule's base value yields 1. + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
$(cardinal,plural syntax)$in all rule sets
$(cardinal,plural syntax)$in all rule setsThis provides the ability to choose a word based on the number divided by the radix to the power of the * exponent of the base value for the specified locale, which is normally equivalent to the << value. - * This uses the cardinal plural rules from PluralFormat. All strings used in the plural format are treated + * This uses the cardinal plural rules from {@link PluralFormat}. All strings used in the plural format are treated * as the same base value for parsing.
$(ordinal,plural syntax)$in all rule sets
$(ordinal,plural syntax)$in all rule setsThis provides the ability to choose a word based on the number divided by the radix to the power of the * exponent of the base value for the specified locale, which is normally equivalent to the << value. - * This uses the ordinal plural rules from PluralFormat. All strings used in the plural format are treated + * This uses the ordinal plural rules from {@link PluralFormat}. All strings used in the plural format are treated * as the same base value for parsing.
@@ -522,22 +525,25 @@ enum URBNFRuleSetTag { *

The substitution descriptor (i.e., the text between the token characters) may take one * of three forms:

* - * + *
+ * + * + * + * * - * + * * * - * - * + * + * * * - * - * + * + * *
DescriptorDescription
a rule set namea rule set namePerform the mathematical operation on the number, and format the result using the * named rule set.
a DecimalFormat pattern
a DecimalFormat patternPerform the mathematical operation on the number, and format the result using a * DecimalFormat with the specified pattern.  The pattern must begin with 0 or #.
nothing
nothingPerform the mathematical operation on the number, and format the result using the rule - * set containing the current rule, except: - *
    + * set containing the current rule, except:
      *
    • You can't have an empty substitution descriptor with a == substitution.
    • *
    • If you omit the substitution descriptor in a >> substitution in a fraction rule, * format the result one digit at a time using the rule set containing the current rule.
    • diff --git a/icu4c/source/test/intltest/itrbnf.cpp b/icu4c/source/test/intltest/itrbnf.cpp index 8508c32e98a7..aac33618e964 100644 --- a/icu4c/source/test/intltest/itrbnf.cpp +++ b/icu4c/source/test/intltest/itrbnf.cpp @@ -81,8 +81,9 @@ void IntlTestRBNF::runIndexedTest(int32_t index, UBool exec, const char* &name, TESTCASE(29, TestNumberingSystem); TESTCASE(30, TestDFRounding); TESTCASE(31, TestMemoryLeak22899); - TESTCASE(32, TestInfiniteRecursion); - TESTCASE(33, TestParseRuleDescriptorOverflow23002); + TESTCASE(32, TestParseRuleDescriptorOverflow23002); + TESTCASE(33, TestInfiniteRecursion); + TESTCASE(34, testOmissionReplacementWithPluralRules); #else TESTCASE(0, TestRBNFDisabled); #endif @@ -2661,6 +2662,90 @@ IntlTestRBNF::TestInfiniteRecursion() { } } +void +IntlTestRBNF::testOmissionReplacementWithPluralRules() { + UnicodeString rules("%cardinal:\n" + "-x: minus >>;\n" + "x.x: << point >>;\n" + "Inf: infinite;\n" + "NaN: not a number;\n" + "zero; one; two; three; four; five; six; seven; eight; nine;\n" + "ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen; seventeen; eighteen; nineteen;\n" + "20: twenty[->>];\n" + "30: thirty[->>];\n" + "40: forty[->>];\n" + "50: fifty[->>];\n" + "60: sixty[->>];\n" + "70: seventy[->>];\n" + "80: eighty[->>];\n" + "90: ninety[->>];\n" + "100: << hundred[ >>];\n" + "1000: << thousand[ >>];\n" + "1000000: << million[ >>];\n" + "1000000000: << billion[ >>];\n" + "1000000000000: << trillion[ >>];\n" + "1000000000000000: =#,##0=;\n" + "%ordinal:\n" + "-x: minus >>;\n" + "x.x: =#,##0.#=;\n" + "Inf: infinitieth;\n" + "zeroth; first; second; third; fourth; fifth; sixth; seventh; eighth; ninth;\n" + "tenth; eleventh; twelfth;\n" + "13: =%cardinal=th;\n" + "20: twent[y->>|ieth];\n" + "30: thirt[y->>|ieth];\n" + "40: fort[y->>|ieth];\n" + "50: fift[y->>|ieth];\n" + "60: sixt[y->>|ieth];\n" + "70: sevent[y->>|ieth];\n" + "80: eight[y->>|ieth];\n" + "90: ninet[y->>|ieth];\n" + "100: <%cardinal< 
[$(cardinal,one{hundred}other{hundreds})$ >>|$(cardinal,one{hundredth}other{hundredths})$];\n" + "1000: <%cardinal< [$(cardinal,one{thousand}other{thousands})$ >>|$(cardinal,one{thousandth}other{thousandths})$];\n" + "1000000: <%cardinal< [$(cardinal,one{million}other{millions})$ >>|$(cardinal,one{millionth}other{millionths})$];\n" + "1000000000: <%cardinal< [$(cardinal,one{billion}other{billions})$ >>|$(cardinal,one{billionth}other{billionths})$];\n" + "1000000000000: <%cardinal< [$(cardinal,one{trillion}other{trillions})$ >>|$(cardinal,one{trillionth}other{trillionths})$];\n" + "1000000000000000: =#,##0=$(ordinal,one{st}two{nd}few{rd}other{th})$;"); + UErrorCode status = U_ZERO_ERROR; + UParseError perror; + icu::RuleBasedNumberFormat rbnf(rules, icu::Locale::getEnglish(), perror, status); + + const char * const enTestFullData[][2] = { + {"20", "twentieth"}, + {"21", "twenty-first"}, + {"29", "twenty-ninth"}, + {"30", "thirtieth"}, + {"31", "thirty-first"}, + {"39", "thirty-ninth"}, + {"100", "one hundredth"}, + {"101", "one hundred first"}, + {"200", "two hundredths"}, + {"201", "two hundreds first"}, + {"300", "three hundredths"}, + {"301", "three hundreds first"}, + {"1000", "one thousandth"}, + {"1001", "one thousand first"}, + {"1100", "one thousand one hundredth"}, + {"1101", "one thousand one hundred first"}, + {"1200", "one thousand two hundredths"}, + {"1201", "one thousand two hundreds first"}, + {"2000", "two thousandths"}, + {"2001", "two thousands first"}, + {"2100", "two thousands one hundredth"}, + {"2101", "two thousands one hundred first"}, + {"8000", "eight thousandths"}, + {"8001", "eight thousands first"}, + {"888000", "eight hundred eighty-eight thousandths"}, + {"888001", "eight hundred eighty-eight thousands first"}, + {"888100", "eight hundred eighty-eight thousands one hundredth"}, + {"999101", "nine hundred ninety-nine thousands one hundred first"}, + {"999200", "nine hundred ninety-nine thousands two hundredths"}, + {"999201", "nine 
hundred ninety-nine thousands two hundreds first"}, + { nullptr, nullptr } + }; + doTest(&rbnf, enTestFullData, false); +} + /* U_HAVE_RBNF */ #else diff --git a/icu4c/source/test/intltest/itrbnf.h b/icu4c/source/test/intltest/itrbnf.h index 3dfdc369ce14..377e49b4d4f3 100644 --- a/icu4c/source/test/intltest/itrbnf.h +++ b/icu4c/source/test/intltest/itrbnf.h @@ -19,139 +19,139 @@ class IntlTestRBNF : public IntlTest { - public: +public: - // IntlTest override - virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) override; + // IntlTest override + virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) override; #if U_HAVE_RBNF - /** - * Perform an API test - */ - virtual void TestAPI(); + /** + * Perform an API test + */ + virtual void TestAPI(); - void TestMultiplePluralRules(); + void TestMultiplePluralRules(); - /** - * Perform a simple spot check on the FractionalRuleSet logic - */ - virtual void TestFractionalRuleSet(); + /** + * Perform a simple spot check on the FractionalRuleSet logic + */ + virtual void TestFractionalRuleSet(); #if 0 - /** - * Perform API tests on llong - */ - virtual void TestLLong(); - virtual void TestLLongConstructors(); - virtual void TestLLongSimpleOperators(); + /** + * Perform API tests on llong + */ + virtual void TestLLong(); + virtual void TestLLongConstructors(); + virtual void TestLLongSimpleOperators(); #endif - /** - * Perform a simple spot check on the English spellout rules - */ - void TestEnglishSpellout(); - - /** - * Perform a simple spot check on the English ordinal-abbreviation rules - */ - void TestOrdinalAbbreviations(); - - /** - * Perform a simple spot check on the duration-formatting rules - */ - void TestDurations(); - - /** - * Test that rounding works correctly on multiplier substitutions that use - * a DecimalFormat. 
- */ - void TestDFRounding(); - - /** - * Perform a simple spot check on the Spanish spellout rules - */ - void TestSpanishSpellout(); - - /** - * Perform a simple spot check on the French spellout rules - */ - void TestFrenchSpellout(); - - /** - * Perform a simple spot check on the Swiss French spellout rules - */ - void TestSwissFrenchSpellout(); - - /** - * Check that Belgian French matches Swiss French spellout rules - */ - void TestBelgianFrenchSpellout(); - - /** - * Perform a simple spot check on the Italian spellout rules - */ - void TestItalianSpellout(); - - /** - * Perform a simple spot check on the Portuguese spellout rules - */ - void TestPortugueseSpellout(); - - /** - * Perform a simple spot check on the German spellout rules - */ - void TestGermanSpellout(); - - /** - * Perform a simple spot check on the Thai spellout rules - */ - void TestThaiSpellout(); - - /** - * Perform a simple spot check on the Norwegian (no,nb) spellout rules - */ - void TestNorwegianSpellout(); - - /** - * Perform a simple spot check on the Swedish spellout rules - */ - void TestSwedishSpellout(); - - /** - * Perform a simple spot check on small values - */ - void TestSmallValues(); - - /** - * Test localizations using string data. - */ - void TestLocalizations(); - - /** - * Test that all locales construct ok. - */ - void TestAllLocales(); - - /** - * Test that hebrew fractions format without trailing '<' - */ - void TestHebrewFraction(); - - /** - * Regression test, don't truncate - * when doing multiplier substitution to a number format rule. 
- */ - void TestMultiplierSubstitution(); - - /** - * Test the setDecimalFormatSymbols in RBNF - */ - void TestSetDecimalFormatSymbols(); - - /** - * Test the plural rules in RBNF - */ - void TestPluralRules(); + /** + * Perform a simple spot check on the English spellout rules + */ + void TestEnglishSpellout(); + + /** + * Perform a simple spot check on the English ordinal-abbreviation rules + */ + void TestOrdinalAbbreviations(); + + /** + * Perform a simple spot check on the duration-formatting rules + */ + void TestDurations(); + + /** + * Test that rounding works correctly on multiplier substitutions that use + * a DecimalFormat. + */ + void TestDFRounding(); + + /** + * Perform a simple spot check on the Spanish spellout rules + */ + void TestSpanishSpellout(); + + /** + * Perform a simple spot check on the French spellout rules + */ + void TestFrenchSpellout(); + + /** + * Perform a simple spot check on the Swiss French spellout rules + */ + void TestSwissFrenchSpellout(); + + /** + * Check that Belgian French matches Swiss French spellout rules + */ + void TestBelgianFrenchSpellout(); + + /** + * Perform a simple spot check on the Italian spellout rules + */ + void TestItalianSpellout(); + + /** + * Perform a simple spot check on the Portuguese spellout rules + */ + void TestPortugueseSpellout(); + + /** + * Perform a simple spot check on the German spellout rules + */ + void TestGermanSpellout(); + + /** + * Perform a simple spot check on the Thai spellout rules + */ + void TestThaiSpellout(); + + /** + * Perform a simple spot check on the Norwegian (no,nb) spellout rules + */ + void TestNorwegianSpellout(); + + /** + * Perform a simple spot check on the Swedish spellout rules + */ + void TestSwedishSpellout(); + + /** + * Perform a simple spot check on small values + */ + void TestSmallValues(); + + /** + * Test localizations using string data. + */ + void TestLocalizations(); + + /** + * Test that all locales construct ok. 
+ */ + void TestAllLocales(); + + /** + * Test that hebrew fractions format without trailing '<' + */ + void TestHebrewFraction(); + + /** + * Regression test, don't truncate + * when doing multiplier substitution to a number format rule. + */ + void TestMultiplierSubstitution(); + + /** + * Test the setDecimalFormatSymbols in RBNF + */ + void TestSetDecimalFormatSymbols(); + + /** + * Test the plural rules in RBNF + */ + void TestPluralRules(); void TestInfinityNaN(); void TestVariableDecimalPoint(); @@ -162,17 +162,18 @@ class IntlTestRBNF : public IntlTest { void TestMinMaxIntegerDigitsIgnored(); void TestNumberingSystem(); void TestMemoryLeak22899(); - void TestInfiniteRecursion(); void TestParseRuleDescriptorOverflow23002(); + void TestInfiniteRecursion(); + void testOmissionReplacementWithPluralRules(); protected: - virtual void doTest(RuleBasedNumberFormat* formatter, const char* const testData[][2], UBool testParsing); - virtual void doLenientParseTest(RuleBasedNumberFormat* formatter, const char* testData[][2]); + virtual void doTest(RuleBasedNumberFormat* formatter, const char* const testData[][2], UBool testParsing); + virtual void doLenientParseTest(RuleBasedNumberFormat* formatter, const char* testData[][2]); /* U_HAVE_RBNF */ #else - virtual void TestRBNFDisabled(); + virtual void TestRBNFDisabled(); /* U_HAVE_RBNF */ #endif diff --git a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/format/RbnfTest.java b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/format/RbnfTest.java index 551a4dea0993..f7d96b3acc0e 100644 --- a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/format/RbnfTest.java +++ b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/format/RbnfTest.java @@ -1951,4 +1951,89 @@ public void TestInfiniteRecursion() { } } } + + /** + * This test is a little contrived for English, but the grammar is relevant for several languages, including: + * Latin, Germanic, Slavic and Indic. 
+ */ + @Test + public void testOmissionReplacementWithPluralRules() { + final String rules = "%cardinal:\n" + + "-x: minus >>;\n" + + "x.x: << point >>;\n" + + "Inf: infinite;\n" + + "NaN: not a number;\n" + + "zero; one; two; three; four; five; six; seven; eight; nine;\n" + + "ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen; seventeen; eighteen; nineteen;\n" + + "20: twenty[->>];\n" + + "30: thirty[->>];\n" + + "40: forty[->>];\n" + + "50: fifty[->>];\n" + + "60: sixty[->>];\n" + + "70: seventy[->>];\n" + + "80: eighty[->>];\n" + + "90: ninety[->>];\n" + + "100: << hundred[ >>];\n" + + "1000: << thousand[ >>];\n" + + "1000000: << million[ >>];\n" + + "1000000000: << billion[ >>];\n" + + "1000000000000: << trillion[ >>];\n" + + "1000000000000000: =#,##0=;\n" + + "%ordinal:\n" + + "-x: minus >>;\n" + + "x.x: =#,##0.#=;\n" + + "Inf: infinitieth;\n" + + "zeroth; first; second; third; fourth; fifth; sixth; seventh; eighth; ninth;\n" + + "tenth; eleventh; twelfth;\n" + + "13: =%cardinal=th;\n" + + "20: twent[y->>|ieth];\n" + + "30: thirt[y->>|ieth];\n" + + "40: fort[y->>|ieth];\n" + + "50: fift[y->>|ieth];\n" + + "60: sixt[y->>|ieth];\n" + + "70: sevent[y->>|ieth];\n" + + "80: eight[y->>|ieth];\n" + + "90: ninet[y->>|ieth];\n" + + "100: <%cardinal< [$(cardinal,one{hundred}other{hundreds})$ >>|$(cardinal,one{hundredth}other{hundredths})$];\n" + + "1000: <%cardinal< [$(cardinal,one{thousand}other{thousands})$ >>|$(cardinal,one{thousandth}other{thousandths})$];\n" + + "1000000: <%cardinal< [$(cardinal,one{million}other{millions})$ >>|$(cardinal,one{millionth}other{millionths})$];\n" + + "1000000000: <%cardinal< [$(cardinal,one{billion}other{billions})$ >>|$(cardinal,one{billionth}other{billionths})$];\n" + + "1000000000000: <%cardinal< [$(cardinal,one{trillion}other{trillions})$ >>|$(cardinal,one{trillionth}other{trillionths})$];\n" + + "1000000000000000: =#,##0=$(ordinal,one{st}two{nd}few{rd}other{th})$;"; + RuleBasedNumberFormat rbnf = new 
RuleBasedNumberFormat(rules, ULocale.US); + + String[][] enTestFullData = { + {"20", "twentieth"}, + {"21", "twenty-first"}, + {"29", "twenty-ninth"}, + {"30", "thirtieth"}, + {"31", "thirty-first"}, + {"39", "thirty-ninth"}, + {"100", "one hundredth"}, + {"101", "one hundred first"}, + {"200", "two hundredths"}, + {"201", "two hundreds first"}, + {"300", "three hundredths"}, + {"301", "three hundreds first"}, + {"1000", "one thousandth"}, + {"1001", "one thousand first"}, + {"1100", "one thousand one hundredth"}, + {"1101", "one thousand one hundred first"}, + {"1200", "one thousand two hundredths"}, + {"1201", "one thousand two hundreds first"}, + {"2000", "two thousandths"}, + {"2001", "two thousands first"}, + {"2100", "two thousands one hundredth"}, + {"2101", "two thousands one hundred first"}, + {"8000", "eight thousandths"}, + {"8001", "eight thousands first"}, + {"888000", "eight hundred eighty-eight thousandths"}, + {"888001", "eight hundred eighty-eight thousands first"}, + {"888100", "eight hundred eighty-eight thousands one hundredth"}, + {"999101", "nine hundred ninety-nine thousands one hundred first"}, + {"999200", "nine hundred ninety-nine thousands two hundredths"}, + {"999201", "nine hundred ninety-nine thousands two hundreds first"}, + }; + doTest(rbnf, enTestFullData, false); + } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRule.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRule.java index bd32b5c92a6f..4c7d270a2832 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRule.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRule.java @@ -162,6 +162,7 @@ public static void makeRules(String description, // then it's really shorthand for two rules (with one exception) NFRule rule2 = null; StringBuilder sbuf = new StringBuilder(); + int orElseOp = description.indexOf('|'); // we'll actually only split the rule into two rules if its // base value is an even multiple of its divisor (or it's one @@ -205,9 
+206,12 @@ else if (rule1.baseValue == DEFAULT_RULE) { // rule2's rule text omits the stuff in brackets: initialize // its rule text and substitutions accordingly - sbuf.append(description.substring(0, brack1)); + sbuf.append(description, 0, brack1); + if (orElseOp >= 0) { + sbuf.append(description, orElseOp + 1, brack2); + } if (brack2 + 1 < description.length()) { - sbuf.append(description.substring(brack2 + 1)); + sbuf.append(description, brack2 + 1, description.length()); } rule2.extractSubstitutions(owner, sbuf.toString(), predecessor); } @@ -216,8 +220,13 @@ else if (rule1.baseValue == DEFAULT_RULE) { // the brackets themselves: initialize _its_ rule text and // substitutions accordingly sbuf.setLength(0); - sbuf.append(description.substring(0, brack1)); - sbuf.append(description.substring(brack1 + 1, brack2)); + sbuf.append(description, 0, brack1); + if (orElseOp >= 0) { + sbuf.append(description, brack1 + 1, orElseOp); + } + else { + sbuf.append(description, brack1 + 1, brack2); + } if (brack2 + 1 < description.length()) { sbuf.append(description.substring(brack2 + 1)); } @@ -394,7 +403,7 @@ else if (descriptor.equals("Inf")) { // finally, if the rule body begins with an apostrophe, strip it off // (this is generally used to put whitespace at the beginning of // a rule's rule text) - if (description.length() > 0 && description.charAt(0) == '\'') { + if (!description.isEmpty() && description.charAt(0) == '\'') { description = description.substring(1); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRuleSet.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRuleSet.java index 7a21a6f01f58..4b7d6812d8e5 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRuleSet.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/NFRuleSet.java @@ -105,7 +105,7 @@ public NFRuleSet(RuleBasedNumberFormat owner, String[] descriptions, int index) this.owner = owner; String description = descriptions[index]; - if (description.length() == 0) { + 
if (description.isEmpty()) { throw new IllegalArgumentException("Empty rule set description"); } @@ -119,12 +119,12 @@ public NFRuleSet(RuleBasedNumberFormat owner, String[] descriptions, int index) throw new IllegalArgumentException("Rule set name doesn't end in colon"); } else { - String name = description.substring(0, pos); - this.isParseable = !name.endsWith("@noparse"); + String ruleName = description.substring(0, pos); + this.isParseable = !ruleName.endsWith("@noparse"); if (!this.isParseable) { - name = name.substring(0,name.length()-8); // Remove the @noparse from the name + ruleName = ruleName.substring(0, ruleName.length() - 8); // Remove the @noparse from the name } - this.name = name; + this.name = ruleName; //noinspection StatementWithEmptyBody while (pos < description.length() && PatternProps.isWhiteSpace(description.charAt(++pos))) { @@ -140,7 +140,7 @@ public NFRuleSet(RuleBasedNumberFormat owner, String[] descriptions, int index) isParseable = true; } - if (description.length() == 0) { + if (description.isEmpty()) { throw new IllegalArgumentException("Empty rule set description"); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedNumberFormat.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedNumberFormat.java index 60f2f30b3999..c1202b5c5bc1 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedNumberFormat.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedNumberFormat.java @@ -32,11 +32,11 @@ /** - *

      A class that formats numbers according to a set of rules. This number formatter is + * The RuleBasedNumberFormat class formats numbers according to a set of rules. This number formatter is * typically used for spelling out numeric values in words (e.g., 25,376 as * "twenty-five thousand three hundred seventy-six" or "vingt-cinq mille trois * cents soixante-seize" or - * "funfundzwanzigtausenddreihundertsechsundsiebzig"), but can also be used for + * "fünfundzwanzigtausenddreihundertsechsundsiebzig"), but can also be used for * other complicated formatting tasks, such as formatting a number of seconds as hours, * minutes and seconds (e.g., 3,730 as "1:02:10").

      * @@ -44,11 +44,11 @@ * spells out a value in words (123 is "one hundred twenty-three"); ordinal, which * appends an ordinal suffix to the end of a numeral (123 is "123rd"); and * duration, which shows a duration in seconds as hours, minutes, and seconds (123 is - * "2:03").  The client can also define more specialized RuleBasedNumberFormats + * "2:03").  The client can also define more specialized RuleBasedNumberFormats * by supplying programmer-defined rule sets.

      * - *

      The behavior of a RuleBasedNumberFormat is specified by a textual description - * that is either passed to the constructor as a String or loaded from a resource + *

      The behavior of a RuleBasedNumberFormat is specified by a textual description + * that is either passed to the constructor as a String or loaded from a resource * bundle. In its simplest form, the description consists of a semicolon-delimited list of rules. * Each rule has a string of output text and a value or range of values it is applicable to. * In a typical spellout rule set, the first twenty rules are the words for the numbers from @@ -60,8 +60,9 @@ *

      For larger numbers, we can use the preceding set of rules to format the ones place, and * we only have to supply the words for the multiples of 10:

      * - *
      20: twenty[->>];
      - * 30: thirty{->>];
      + * 
      + * 20: twenty[->>];
      + * 30: thirty[->>];
        * 40: forty[->>];
        * 50: fifty[->>];
        * 60: sixty[->>];
      @@ -81,7 +82,8 @@
        * 

      For even larger numbers, we can actually look up several parts of the number in the * list:

      * - *
      100: << hundred[ >>];
      + *
      + * 100: << hundred[ >>];
      * *

      The "<<" represents a new kind of substitution. The << isolates * the hundreds digit (and any digits to its left), formats it using this same rule set, and @@ -99,13 +101,15 @@ * *

      This rule covers values up to 999, at which point we add another rule:

      * - *
      1000: << thousand[ >>];
      + *
      + * 1000: << thousand[ >>];
      * *

      Again, the meanings of the brackets and substitution tokens shift because the rule's * base value is a higher power of 10, changing the rule's divisor. This rule can actually be * used all the way up to 999,999. This allows us to finish out the rules as follows:

      * - *
      1,000,000: << million[ >>];
      + * 
      + * 1,000,000: << million[ >>];
        * 1,000,000,000: << billion[ >>];
        * 1,000,000,000,000: << trillion[ >>];
        * 1,000,000,000,000,000: OUT OF RANGE!;
      @@ -121,34 +125,28 @@ *

      To see how these rules actually work in practice, consider the following example: * Formatting 25,340 with this rule set would work like this:

      * - * + *
      * - * * * * * - * * * * * - * * * * * - * * * * * - * * * * * - * * * @@ -187,20 +185,20 @@ * *
      * - *

      The description of a RuleBasedNumberFormat's behavior consists of one or more rule + *

      The description of a RuleBasedNumberFormat's behavior consists of one or more rule * sets. Each rule set consists of a name, a colon, and a list of rules. A rule * set name must begin with a % sign. Rule sets with names that begin with a single % sign * are public: the caller can specify that they be used to format and parse numbers. * Rule sets with names that begin with %% are private: they exist only for the use * of other rule sets. If a formatter only has one rule set, the name may be omitted.

      * - *

      The user can also specify a special "rule set" named %%lenient-parse. - * The body of %%lenient-parse isn't a set of number-formatting rules, but a RuleBasedCollator + *

      The user can also specify a special "rule set" named %%lenient-parse. + * The body of %%lenient-parse isn't a set of number-formatting rules, but a RuleBasedCollator * description which is used to define equivalences for lenient parsing. For more information - * on the syntax, see RuleBasedCollator. For more information on lenient parsing, - * see setLenientParse(). Note: symbols that have syntactic meaning + * on the syntax, see RuleBasedCollator. For more information on lenient parsing, + * see setLenientParse(). Note: symbols that have syntactic meaning * in collation rules, such as '&', have no particular meaning when appearing outside - * of the lenient-parse rule set.

      + * of the lenient-parse rule set.

      * *

      The body of a rule set consists of an ordered, semicolon-delimited list of rules. * Internally, every rule has a base value, a divisor, rule text, and zero, one, or two substitutions. @@ -210,48 +208,46 @@ *

      A rule descriptor can take one of the following forms (text in italics is the * name of a token):

      * - *
      << thousand >>[the rule whose base value is 1,000 is applicable to 25,340]
      twenty->> thousand >>[25,340 over 1,000 is 25. The rule for 20 applies.]
      twenty-five thousand >>[25 mod 10 is 5. The rule for 5 is "five."]
      twenty-five thousand << hundred >>[25,340 mod 1,000 is 340. The rule for 100 applies.]
      twenty-five thousand three hundred >>[340 over 100 is 3. The rule for 3 is "three."]
      twenty-five thousand three hundred forty[340 mod 100 is 40. The rule for 40 applies. Since 40 divides * evenly by 10, the hyphen and substitution in the brackets are omitted.]
      + *
      * - * - * - * + * + * + * + * + * * - * - * - * - * + * + * * - * - * - * - * + * + * * - * - * - * - * + * + * * - * - * - * - * + * + * + * * - * - * - * - * + * + * * - * - * - * - * + * + * * - * - * - * - * + * + * * - * - * - * + * + * * * - * - * - * + * + * * * - * - * - * + * + * *
      bv:bv specifies the rule's base value. bv is a decimal + * DescriptorDescription
      bv:bv specifies the rule's base value. bv is a decimal * number expressed using ASCII digits. bv may contain spaces, period, and commas, * which are ignored. The rule's divisor is the highest power of 10 less than or equal to * the base value.
      bv/rad:bv specifies the rule's base value. The rule's divisor is the + *
      bv/rad:bv specifies the rule's base value. The rule's divisor is the * highest power of rad less than or equal to the base value.
      bv>:bv specifies the rule's base value. To calculate the divisor, + *
      bv>:bv specifies the rule's base value. To calculate the divisor, * let the radix be 10, and the exponent be the highest exponent of the radix that yields a * result less than or equal to the base value. Every > character after the base value * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix * raised to the power of the exponent; otherwise, the divisor is 1.
      bv/rad>:bv specifies the rule's base value. To calculate the divisor, + *
      bv/rad>:bv specifies the rule's base value. To calculate the divisor, * let the radix be rad, and the exponent be the highest exponent of the radix that * yields a result less than or equal to the base value. Every > character after the radix * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix * raised to the power of the exponent; otherwise, the divisor is 1.
      -x:The rule is a negative-number rule.
      -x:The rule is a negative-number rule.
      x.x:The rule is an improper fraction rule. If the full stop in + *
      x.x:The rule is an improper fraction rule. If the full stop in * the middle of the rule name is replaced with the decimal point * that is used in the language or DecimalFormatSymbols, then that rule will * have precedence when formatting and parsing this rule. For example, some @@ -260,10 +256,9 @@ * handle the decimal point that matches the language's natural spelling of * the punctuation of either the full stop or comma.
      0.x:The rule is a proper fraction rule. If the full stop in + *
      0.x:The rule is a proper fraction rule. If the full stop in * the middle of the rule name is replaced with the decimal point * that is used in the language or DecimalFormatSymbols, then that rule will * have precedence when formatting and parsing this rule. For example, some @@ -272,10 +267,9 @@ * handle the decimal point that matches the language's natural spelling of * the punctuation of either the full stop or comma
      x.0:The rule is a default rule. If the full stop in + *
      x.0:The rule is a default rule. If the full stop in * the middle of the rule name is replaced with the decimal point * that is used in the language or DecimalFormatSymbols, then that rule will * have precedence when formatting and parsing this rule. For example, some @@ -284,19 +278,16 @@ * handle the decimal point that matches the language's natural spelling of * the punctuation of either the full stop or comma
      Inf:
      Inf:The rule for infinity.
      NaN:
      NaN:The rule for an IEEE 754 NaN (not a number).
      nothing
      nothingIf the rule's rule descriptor is left out, the base value is one plus the * preceding rule's base value (or zero if this is the first rule in the list) in a normal * rule set.  In a fraction rule set, the base value is the same as the preceding rule's @@ -313,8 +304,8 @@ * algorithms: If the rule set is a regular rule set, do the following: * *
        - *
      • If the rule set includes a default rule (and the number was passed in as a double), - * use the default rule.  (If the number being formatted was passed in as a long, + *
      • If the rule set includes a default rule (and the number was passed in as a double), + * use the default rule.  (If the number being formatted was passed in as a long, * the default rule is ignored.)
      • *
      • If the number is negative, use the negative-number rule.
      • *
      • If the number has a fractional part and is greater than 1, use the improper fraction @@ -361,49 +352,43 @@ * *

        The meanings of the substitution token characters are as follows:

        * - * + *
        * - * - * - * + * + * + * + * + * + * + * * * * - * - * - * + * * * * - * - * - * + * * * * - * - * - * + * * * - * - * - * - * + * + * + * * * * - * - * - * + * * * - * - * - * - * + * + * + * * * * - * - * - * + * * * * - * - * - * + * * * * - * - * - * + * * * - * - * - * - * + * + * + * * * - * - * - * - * - * + * + * + * + * * * - * - * - * + * * * * - * - * - * - * + * + * * * - * - * - * - * + * + * * * - * - * - * + * * * * - * - * - * - * + * + * * - * - * - * - * + * + * + * * * - * - * - * - * + * + * + * * * *
        >>in normal ruleSyntaxUsageDescription
        >>in normal ruleDivide the number by the rule's divisor and format the remainder
        in negative-number rulein negative-number ruleFind the absolute value of the number and format the result
        in fraction or default rulein fraction or default ruleIsolate the number's fractional part and format it.
        in rule in fraction rule setin rule in fraction rule setNot allowed.
        >>>in normal rule
        >>>in normal ruleDivide the number by the rule's divisor and format the remainder, * but bypass the normal rule-selection process and just use the * rule that precedes this one in this rule list.
        in all other rulesin all other rulesNot allowed.
        <<in normal rule
        <<in normal ruleDivide the number by the rule's divisor, perform floor() on the quotient, * and format the resulting value.
        * If there is a DecimalFormat pattern between the < characters and the @@ -416,83 +401,67 @@ *
        in negative-number rulein negative-number ruleNot allowed.
        in fraction or default rulein fraction or default ruleIsolate the number's integral part and format it.
        in rule in fraction rule setin rule in fraction rule setMultiply the number by the rule's base value and format the result.
        ==in all rule sets
        ==in all rule setsFormat the number unchanged
        []in normal ruleOmit the optional text if the number is an even multiple of the rule's divisor
        []in normal ruleOmit the optional text and rules if the number is an even multiple of the rule's divisor. + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
        in negative-number rulein negative-number ruleNot allowed.
        in improper-fraction ruleOmit the optional text if the number is between 0 and 1 (same as specifying both an - * x.x rule and a 0.x rule)in improper-fraction ruleOmit the optional text and rules if the number is between 0 and 1 (same as specifying both an + * x.x rule and a 0.x rule). + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
        in default ruleOmit the optional text if the number is an integer (same as specifying both an x.x - * rule and an x.0 rule)in default ruleOmit the optional text and rules if the number is an integer (same as specifying both an x.x + * rule and an x.0 rule). + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
        in proper-fraction rulein proper-fraction ruleNot allowed.
        in rule in fraction rule setOmit the optional text if multiplying the number by the rule's base value yields 1.in rule in fraction rule setOmit the optional text and rules if multiplying the number by the rule's base value yields 1. + * If the | symbol is present, the text and rules after the symbol and before the right bracket will be used instead of omission.
        $(cardinal,plural syntax)$in all rule sets
        $(cardinal,plural syntax)$in all rule setsThis provides the ability to choose a word based on the number divided by the radix to the power of the * exponent of the base value for the specified locale, which is normally equivalent to the << value. - * This uses the cardinal plural rules from PluralFormat. All strings used in the plural format are treated + * This uses the cardinal plural rules from {@link PluralFormat}. All strings used in the plural format are treated * as the same base value for parsing.
        $(ordinal,plural syntax)$in all rule sets
        $(ordinal,plural syntax)$in all rule setsThis provides the ability to choose a word based on the number divided by the radix to the power of the * exponent of the base value for the specified locale, which is normally equivalent to the << value. - * This uses the ordinal plural rules from PluralFormat. All strings used in the plural format are treated + * This uses the ordinal plural rules from {@link PluralFormat}. All strings used in the plural format are treated * as the same base value for parsing.
        @@ -500,22 +469,23 @@ *

        The substitution descriptor (i.e., the text between the token characters) may take one * of three forms:

        * - * + *
        * - * - * + * + * + * + * + * * * - * - * - * + * + * * * - * - * - * + * + * *
        a rule set nameDescriptorDescription
        a rule set namePerform the mathematical operation on the number, and format the result using the * named rule set.
        a DecimalFormat pattern
        a DecimalFormat patternPerform the mathematical operation on the number, and format the result using a * DecimalFormat with the specified pattern.  The pattern must begin with 0 or #.
        nothing
        nothingPerform the mathematical operation on the number, and format the result using the rule * set containing the current rule, except:
          *
        • You can't have an empty substitution descriptor with a == substitution.
        • @@ -553,7 +523,7 @@ public class RuleBasedNumberFormat extends NumberFormat { //----------------------------------------------------------------------- // Generated by serialver from JDK 1.4.1_01 - static final long serialVersionUID = -7664252765575395068L; + private static final long serialVersionUID = -7664252765575395068L; /** * Selector code that tells the constructor to create a spellout formatter @@ -1164,7 +1134,7 @@ public String format(double number, String ruleSet) throws IllegalArgumentExcept /** * Formats the specified number according to the specified rule set. * (If the specified rule set specifies a default ["x.0"] rule, this function - * ignores it. Convert the number to a double first if you ned it.) This + * ignores it. Convert the number to a double first if you need it.) This * function preserves all the precision in the long-- it doesn't convert it * to a double. * @param number The number to format. @@ -1210,7 +1180,7 @@ public StringBuffer format(double number, * Formats the specified number using the formatter's default rule set. * (The default rule set is the last public rule set defined in the description.) * (If the specified rule set specifies a default ["x.0"] rule, this function - * ignores it. Convert the number to a double first if you ned it.) This + * ignores it. Convert the number to a double first if you need it.) This * function preserves all the precision in the long-- it doesn't convert it * to a double. * @param number The number to format. @@ -1305,7 +1275,7 @@ public StringBuffer format(com.ibm.icu.math.BigDecimal number, public Number parse(String text, ParsePosition parsePosition) { // parsePosition tells us where to start parsing. 
We copy the - // text in the string from here to the end inro a new string, + // text in the string from here to the end into a new string, // and create a new ParsePosition and result variable to use // for the duration of the parse operation String workingText = text.substring(parsePosition.getIndex()); @@ -1334,10 +1304,9 @@ public Number parse(String text, ParsePosition parsePosition) { result = tempResult; highWaterMark.setIndex(workingPos.getIndex()); } - // commented out because this API on ParsePosition doesn't exist in 1.1.x - // if (workingPos.getErrorIndex() > highWaterMark.getErrorIndex()) { - // highWaterMark.setErrorIndex(workingPos.getErrorIndex()); - // } + if (workingPos.getErrorIndex() > highWaterMark.getErrorIndex()) { + highWaterMark.setErrorIndex(workingPos.getErrorIndex()); + } // if we manage to use up all the characters in the string, // we don't have to try any more rule sets @@ -1350,13 +1319,12 @@ public Number parse(String text, ParsePosition parsePosition) { workingPos.setIndex(0); } - // add the high water mark to our original parse position and + // add the high watermark to our original parse position and // return the result parsePosition.setIndex(parsePosition.getIndex() + highWaterMark.getIndex()); - // commented out because this API on ParsePosition doesn't exist in 1.1.x - // if (highWaterMark.getIndex() == 0) { - // parsePosition.setErrorIndex(parsePosition.getIndex() + highWaterMark.getErrorIndex()); - // } + if (highWaterMark.getIndex() == 0) { + parsePosition.setErrorIndex(parsePosition.getIndex() + highWaterMark.getErrorIndex()); + } return result; } @@ -1668,7 +1636,7 @@ NFRule getDefaultNaNRule() { * @param specialName the name of the special rule text to extract * @return the special rule text, or null if the rule was not found */ - private String extractSpecial(StringBuilder description, String specialName) { + private static String extractSpecial(StringBuilder description, String specialName) { String result = null; 
int lp = description.indexOf(specialName); if (lp != -1) { @@ -1701,7 +1669,7 @@ private String extractSpecial(StringBuilder description, String specialName) { } /** - * This function parses the description and uses it to build all of + * This function parses the description and uses it to build all of the * internal data structures that the formatter uses to do formatting * @param description The description of the formatter's desired behavior. * This is either passed in by the caller or loaded out of a resource @@ -1747,7 +1715,7 @@ private void init(String description, String[][] localizations) { // divide up the descriptions into individual rule-set descriptions // and store them in a temporary array. At each step, we also - // new up a rule set, but all this does is initialize its name + // create a rule set, but all this does is initialize its name // and remove it from its description. We can't actually parse // the rest of the descriptions and finish initializing everything // because we have to know the names and locations of all the rule @@ -1806,8 +1774,8 @@ private void init(String description, String[][] localizations) { defaultRuleSet = ruleSets[ruleSets.length - 1]; } - // finally, we can go back through the temporary descriptions - // list and finish setting up the substructure + // Now that we know all the rule names, we can go back through + // the temporary descriptions list and finish setting up the substructure for (int i = 0; i < ruleSets.length; i++) { ruleSets[i].parseRules(ruleSetDescriptions[i]); } @@ -1902,40 +1870,34 @@ private StringBuilder stripWhitespace(String description) { // iterate through the characters... int start = 0; + char ch; while (start < descriptionLength) { - // seek to the first non-whitespace character... + // Seek to the first non-whitespace character... 
+ // If the first non-whitespace character is semicolon, skip it and continue while (start < descriptionLength - && PatternProps.isWhiteSpace(description.charAt(start))) + && (PatternProps.isWhiteSpace(ch = description.charAt(start)) || ch == ';')) { ++start; } - //if the first non-whitespace character is semicolon, skip it and continue - if (start < descriptionLength && description.charAt(start) == ';') { - start += 1; - continue; - } - // locate the next semicolon in the text and copy the text from // our current position up to that semicolon into the result int p = description.indexOf(';', start); if (p == -1) { // or if we don't find a semicolon, just copy the rest of // the string into the result - result.append(description.substring(start)); + result.append(description, start, descriptionLength); break; } else if (p < descriptionLength) { - result.append(description.substring(start, p + 1)); - start = p + 1; - } - else { - // when we get here, we've seeked off the end of the string, and - // we terminate the loop (we continue until *start* is -1 rather - // than until *p* is -1, because otherwise we'd miss the last - // rule in the description) - break; + int end = p + 1; + result.append(description, start, end); + start = end; } + // when we get here from the else, we've seeked off the end of the string, and + // we terminate the loop (we continue until *start* is -1 rather + // than until *p* is -1, because otherwise we'd miss the last + // rule in the description) } return result; } diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/text/package.html b/icu4j/main/core/src/main/resources/com/ibm/icu/text/package.html index 96bef4a1f0b5..c249bcb56636 100644 --- a/icu4j/main/core/src/main/resources/com/ibm/icu/text/package.html +++ b/icu4j/main/core/src/main/resources/com/ibm/icu/text/package.html @@ -11,7 +11,7 @@ ICU4J com.ibm.icu.text Package Overview -Extensions and enhancements to java.text to support unicode transforms, UnicodeSet, surrogate 
char utilities, UCA collation, normalization, break iteration (rule and dictionary based), enhanced number format, international string searching, and arabic shaping.

          +

          Extensions and enhancements to java.text to support unicode transforms, UnicodeSet, surrogate char utilities, UCA collation, normalization, break iteration (rule and dictionary based), enhanced number format, international string searching, and arabic shaping.

          • Unicode Transforms (Transliteration) convert between different representations of unicode text.