# HG changeset patch # User Jonathan Kew # Date 1557765344 0 # Mon May 13 16:35:44 2019 +0000 # Node ID a811c910cfd3527d20556d09fca7dafc4003bc4d # Parent 5a20e917ee2d3ac8c01ff1be2ffde204631b9ad9 Bug 1550532 - Avoid auto-hyphenating capitalized words, except for German. r=emilio,mats This affects a number of our existing reftests, so we'll need to update those to not expect auto-hyphenation of a sentence-initial (capitalized) word. (Hyphenation behavior is not sufficiently well-specified for this to be tested at the WPT level, so we just use Gecko-specific reftests.) Differential Revision: https://phabricator.services.mozilla.com/D30912 diff --git a/intl/hyphenation/glue/nsHyphenationManager.cpp b/intl/hyphenation/glue/nsHyphenationManager.cpp --- a/intl/hyphenation/glue/nsHyphenationManager.cpp +++ b/intl/hyphenation/glue/nsHyphenationManager.cpp @@ -74,16 +74,19 @@ nsHyphenationManager::~nsHyphenationMana already_AddRefed nsHyphenationManager::GetHyphenator( nsIAtom* aLocale) { RefPtr hyph; mHyphenators.Get(aLocale, getter_AddRefs(hyph)); if (hyph) { return hyph.forget(); } + nsAutoCString hyphCapPref("intl.hyphenate-capitalized."); + hyphCapPref.Append(nsAtomCString(aLocale)); + bool hyphenateCapitalized = Preferences::GetBool(hyphCapPref.get()); nsCOMPtr uri = mPatternFiles.Get(aLocale); if (!uri) { nsCOMPtr alias = mHyphAliases.Get(aLocale); if (alias) { mHyphenators.Get(alias, getter_AddRefs(hyph)); if (hyph) { return hyph.forget(); } @@ -105,17 +108,17 @@ already_AddRefed nsHyphena localeStr.ReplaceLiteral(i, localeStr.Length() - i, "-*"); nsCOMPtr fuzzyLocale = NS_Atomize(localeStr); return GetHyphenator(fuzzyLocale); } else { return nullptr; } } } - hyph = new nsHyphenator(uri); + hyph = new nsHyphenator(uri, hyphenateCapitalized); if (hyph->IsValid()) { mHyphenators.Put(aLocale, hyph); return hyph.forget(); } #ifdef DEBUG nsCString msg("failed to load patterns from "); msg += uri->GetSpecOrDefault(); NS_WARNING(msg.get()); diff --git 
a/intl/hyphenation/glue/nsHyphenator.cpp b/intl/hyphenation/glue/nsHyphenator.cpp --- a/intl/hyphenation/glue/nsHyphenator.cpp +++ b/intl/hyphenation/glue/nsHyphenator.cpp @@ -6,17 +6,18 @@ #include "nsHyphenator.h" #include "nsIFile.h" #include "nsUTF8Utils.h" #include "nsUnicodeProperties.h" #include "nsIURI.h" #include "hyphen.h" -nsHyphenator::nsHyphenator(nsIURI* aURI) : mDict(nullptr) { +nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized) + : mDict(nullptr), mHyphenateCapitalized(aHyphenateCapitalized) { nsCString uriSpec; nsresult rv = aURI->GetSpec(uriSpec); if (NS_FAILED(rv)) { return; } mDict = hnj_hyphen_load(uriSpec.get()); #ifdef DEBUG if (mDict) { @@ -65,84 +66,100 @@ nsresult nsHyphenator::Hyphenate(const n } wordLimit = i + chLen; if (i + chLen < aString.Length()) { continue; } } if (inWord) { - // Convert the word to utf-8 for libhyphen, lowercasing it as we go - // so that it will match the (lowercased) patterns (bug 1105644). - nsAutoCString utf8; - const char16_t* const begin = aString.BeginReading(); - const char16_t* cur = begin + wordStart; - const char16_t* end = begin + wordLimit; - while (cur < end) { - uint32_t ch = *cur++; - - if (NS_IS_HIGH_SURROGATE(ch)) { - if (cur < end && NS_IS_LOW_SURROGATE(*cur)) { - ch = SURROGATE_TO_UCS4(ch, *cur++); - } else { - ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR - } - } else if (NS_IS_LOW_SURROGATE(ch)) { - ch = 0xfffd; // unpaired surrogate - } - - // XXX What about language-specific casing? Consider Turkish I/i... - // In practice, it looks like the current patterns will not be - // affected by this, as they treat dotted and undotted i similarly. 
- ch = ToLowerCase(ch); - - if (ch < 0x80) { // U+0000 - U+007F - utf8.Append(ch); - } else if (ch < 0x0800) { // U+0100 - U+07FF - utf8.Append(0xC0 | (ch >> 6)); - utf8.Append(0x80 | (0x003F & ch)); - } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF - utf8.Append(0xE0 | (ch >> 12)); - utf8.Append(0x80 | (0x003F & (ch >> 6))); - utf8.Append(0x80 | (0x003F & ch)); - } else { - utf8.Append(0xF0 | (ch >> 18)); - utf8.Append(0x80 | (0x003F & (ch >> 12))); - utf8.Append(0x80 | (0x003F & (ch >> 6))); - utf8.Append(0x80 | (0x003F & ch)); - } - } - - AutoTArray utf8hyphens; - utf8hyphens.SetLength(utf8.Length() + 5); - char** rep = nullptr; - int* pos = nullptr; - int* cut = nullptr; - int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(), - utf8.Length(), utf8hyphens.Elements(), - nullptr, &rep, &pos, &cut); - if (!err) { - // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer - // from utf8 code unit indexing (which would match the utf8 input - // string directly) to Unicode character indexing. - // We then need to convert this to utf16 code unit offsets for Gecko. - const char* hyphPtr = utf8hyphens.Elements(); - const char16_t* cur = begin + wordStart; - const char16_t* end = begin + wordLimit; - while (cur < end) { - if (*hyphPtr & 0x01) { - aHyphens[cur - begin] = true; - } - cur++; - if (cur < end && NS_IS_LOW_SURROGATE(*cur) && - NS_IS_HIGH_SURROGATE(*(cur - 1))) { - cur++; - } - hyphPtr++; - } - } + HyphenateWord(aString, wordStart, wordLimit, aHyphens); + inWord = false; } - - inWord = false; } return NS_OK; } + +void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart, + uint32_t aLimit, nsTArray& aHyphens) { + // Convert word from aStart to aLimit in aString to utf-8 for libhyphen, + // lowercasing it as we go so that it will match the (lowercased) patterns + // (bug 1105644). 
+ nsAutoCString utf8; + const char16_t* const begin = aString.BeginReading(); + const char16_t* cur = begin + aStart; + const char16_t* end = begin + aLimit; + bool firstLetter = true; + while (cur < end) { + uint32_t ch = *cur++; + + if (NS_IS_HIGH_SURROGATE(ch)) { + if (cur < end && NS_IS_LOW_SURROGATE(*cur)) { + ch = SURROGATE_TO_UCS4(ch, *cur++); + } else { + ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR + } + } else if (NS_IS_LOW_SURROGATE(ch)) { + ch = 0xfffd; // unpaired surrogate + } + + // XXX What about language-specific casing? Consider Turkish I/i... + // In practice, it looks like the current patterns will not be + // affected by this, as they treat dotted and undotted i similarly. + uint32_t origCh = ch; + ch = ToLowerCase(ch); + + // Avoid hyphenating capitalized words (bug 1550532) unless explicitly + // allowed by prefs for the language in use. + if (firstLetter) { + if (!mHyphenateCapitalized && ch != origCh) { + return; + } + firstLetter = false; + } + + if (ch < 0x80) { // U+0000 - U+007F + utf8.Append(ch); + } else if (ch < 0x0800) { // U+0080 - U+07FF + utf8.Append(0xC0 | (ch >> 6)); + utf8.Append(0x80 | (0x003F & ch)); + } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF + utf8.Append(0xE0 | (ch >> 12)); + utf8.Append(0x80 | (0x003F & (ch >> 6))); + utf8.Append(0x80 | (0x003F & ch)); + } else { + utf8.Append(0xF0 | (ch >> 18)); + utf8.Append(0x80 | (0x003F & (ch >> 12))); + utf8.Append(0x80 | (0x003F & (ch >> 6))); + utf8.Append(0x80 | (0x003F & ch)); + } + } + + AutoTArray utf8hyphens; + utf8hyphens.SetLength(utf8.Length() + 5); + char** rep = nullptr; + int* pos = nullptr; + int* cut = nullptr; + int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(), + utf8.Length(), utf8hyphens.Elements(), + nullptr, &rep, &pos, &cut); + if (!err) { + // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer + // from utf8 code unit indexing (which would match the utf8 input + // string directly) to 
Unicode character indexing. + // We then need to convert this to utf16 code unit offsets for Gecko. + const char* hyphPtr = utf8hyphens.Elements(); + const char16_t* cur = begin + aStart; + const char16_t* end = begin + aLimit; + while (cur < end) { + if (*hyphPtr & 0x01) { + aHyphens[cur - begin] = true; + } + cur++; + if (cur < end && NS_IS_LOW_SURROGATE(*cur) && + NS_IS_HIGH_SURROGATE(*(cur - 1))) { + cur++; + } + hyphPtr++; + } + } +} diff --git a/intl/hyphenation/glue/nsHyphenator.h b/intl/hyphenation/glue/nsHyphenator.h --- a/intl/hyphenation/glue/nsHyphenator.h +++ b/intl/hyphenation/glue/nsHyphenator.h @@ -9,24 +9,27 @@ #include "nsCOMPtr.h" #include "nsString.h" #include "nsTArray.h" class nsIURI; class nsHyphenator { public: - explicit nsHyphenator(nsIURI* aURI); + nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized); NS_INLINE_DECL_REFCOUNTING(nsHyphenator) bool IsValid(); nsresult Hyphenate(const nsAString& aText, nsTArray& aHyphens); private: ~nsHyphenator(); - protected: + void HyphenateWord(const nsAString& aString, uint32_t aStart, + uint32_t aLimit, nsTArray& aHyphens); + void* mDict; + bool mHyphenateCapitalized; }; #endif // nsHyphenator_h__ diff --git a/layout/reftests/text/auto-hyphenation-af-1-ref.html b/layout/reftests/text/auto-hyphenation-af-1-ref.html --- a/layout/reftests/text/auto-hyphenation-af-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-af-1-ref.html @@ -1,11 +1,11 @@
-Al­le mens­li­ke we­sens word vry, met ge­ly­ke waar­dig­heid en reg­te, ge­bo­re. +Alle mens­li­ke we­sens word vry, met ge­ly­ke waar­dig­heid en reg­te, ge­bo­re.
diff --git a/layout/reftests/text/auto-hyphenation-bg-1-ref.html b/layout/reftests/text/auto-hyphenation-bg-1-ref.html --- a/layout/reftests/text/auto-hyphenation-bg-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-bg-1-ref.html @@ -1,11 +1,11 @@
-Всич­ки хо­ра се раж­дат сво­бод­ни и рав­ни по дос­тойн­с­т­во и пра­ва. +Всички хо­ра се раж­дат сво­бод­ни и рав­ни по дос­тойн­с­т­во и пра­ва.
diff --git a/layout/reftests/text/auto-hyphenation-cy-1-ref.html b/layout/reftests/text/auto-hyphenation-cy-1-ref.html --- a/layout/reftests/text/auto-hyphenation-cy-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-cy-1-ref.html @@ -1,11 +1,11 @@
-Gen­ir pawb yn rhydd ac yn gyd­radd â'i gil­ydd mewn urdd­as a hawl­iau. +Genir pawb yn rhydd ac yn gyd­radd â'i gil­ydd mewn urdd­as a hawl­iau.
diff --git a/layout/reftests/text/auto-hyphenation-da-1-ref.html b/layout/reftests/text/auto-hyphenation-da-1-ref.html --- a/layout/reftests/text/auto-hyphenation-da-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-da-1-ref.html @@ -1,11 +1,11 @@
-Al­le men­ne­sker er født frie og li­ge i vær­dig­hed og ret­tig­he­der. +Alle men­ne­sker er født frie og li­ge i vær­dig­hed og ret­tig­he­der.
diff --git a/layout/reftests/text/auto-hyphenation-es-1-ref.html b/layout/reftests/text/auto-hyphenation-es-1-ref.html --- a/layout/reftests/text/auto-hyphenation-es-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-es-1-ref.html @@ -1,11 +1,11 @@
-To­dos los se­res hu­ma­nos na­cen li­bres e igua­les en dig­ni­dad y de­re­chos +Todos los se­res hu­ma­nos na­cen li­bres e igua­les en dig­ni­dad y de­re­chos
diff --git a/layout/reftests/text/auto-hyphenation-fi-1-ref.html b/layout/reftests/text/auto-hyphenation-fi-1-ref.html --- a/layout/reftests/text/auto-hyphenation-fi-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-fi-1-ref.html @@ -1,11 +1,11 @@
-Kaik­ki ih­mi­set syn­ty­vät va­pai­na ja ta­sa­ver­tai­si­na ar­vol­taan ja oi­keuk­sil­taan. +Kaikki ih­mi­set syn­ty­vät va­pai­na ja ta­sa­ver­tai­si­na ar­vol­taan ja oi­keuk­sil­taan.
diff --git a/layout/reftests/text/auto-hyphenation-gl-1-ref.html b/layout/reftests/text/auto-hyphenation-gl-1-ref.html --- a/layout/reftests/text/auto-hyphenation-gl-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-gl-1-ref.html @@ -1,11 +1,11 @@
-Tó­do­los se­res hu­ma­nos na­cen li­bres e iguais en dig­ni­da­de e de­rei­tos +Tódolos se­res hu­ma­nos na­cen li­bres e iguais en dig­ni­da­de e de­rei­tos
diff --git a/layout/reftests/text/auto-hyphenation-hu-1-ref.html b/layout/reftests/text/auto-hyphenation-hu-1-ref.html --- a/layout/reftests/text/auto-hyphenation-hu-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-hu-1-ref.html @@ -1,11 +1,11 @@
-Min­den em­be­ri lény sza­ba­don szü­le­tik és egyen­lő mél­tó­sá­ga és jo­ga van. +Minden em­be­ri lény sza­ba­don szü­le­tik és egyen­lő mél­tó­sá­ga és jo­ga van.
diff --git a/layout/reftests/text/auto-hyphenation-ia-1-ref.html b/layout/reftests/text/auto-hyphenation-ia-1-ref.html --- a/layout/reftests/text/auto-hyphenation-ia-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-ia-1-ref.html @@ -1,11 +1,11 @@
-To­te le es­se­res hu­man na­sce li­be­re e equal in dig­ni­ta­te e in de­rec­tos +Tote le es­se­res hu­man na­sce li­be­re e equal in dig­ni­ta­te e in de­rec­tos
diff --git a/layout/reftests/text/auto-hyphenation-it-1-ref.html b/layout/reftests/text/auto-hyphenation-it-1-ref.html --- a/layout/reftests/text/auto-hyphenation-it-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-it-1-ref.html @@ -1,11 +1,11 @@
-Tut­ti gli es­se­ri uma­ni na­sco­no li­be­ri ed egua­li in di­gni­tà e di­rit­ti. +Tutti gli es­se­ri uma­ni na­sco­no li­be­ri ed egua­li in di­gni­tà e di­rit­ti.
diff --git a/layout/reftests/text/auto-hyphenation-kmr-1-ref.html b/layout/reftests/text/auto-hyphenation-kmr-1-ref.html --- a/layout/reftests/text/auto-hyphenation-kmr-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-kmr-1-ref.html @@ -1,11 +1,11 @@
-He­mû mi­rov azad û di we­qar û ma­fan de we­k­hev tên din­ya­yê +Hemû mi­rov azad û di we­qar û ma­fan de we­k­hev tên din­ya­yê
diff --git a/layout/reftests/text/auto-hyphenation-la-1-ref.html b/layout/reftests/text/auto-hyphenation-la-1-ref.html --- a/layout/reftests/text/auto-hyphenation-la-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-la-1-ref.html @@ -1,11 +1,11 @@
-Om­nes ho­mi­nes di­gni­ta­te et iu­re li­be­ri et pa­res na­scun­tur +Omnes ho­mi­nes di­gni­ta­te et iu­re li­be­ri et pa­res na­scun­tur
diff --git a/layout/reftests/text/auto-hyphenation-lt-1-ref.html b/layout/reftests/text/auto-hyphenation-lt-1-ref.html --- a/layout/reftests/text/auto-hyphenation-lt-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-lt-1-ref.html @@ -1,11 +1,11 @@
-Vi­si žmo­nės gims­ta lais­vi ir ly­gūs sa­vo oru­mu ir tei­sė­mis. +Visi žmo­nės gims­ta lais­vi ir ly­gūs sa­vo oru­mu ir tei­sė­mis.
diff --git a/layout/reftests/text/auto-hyphenation-nl-1-ref.html b/layout/reftests/text/auto-hyphenation-nl-1-ref.html --- a/layout/reftests/text/auto-hyphenation-nl-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-nl-1-ref.html @@ -1,11 +1,11 @@
-Al­le men­sen wor­den vrij en ge­lijk in waar­dig­heid en rech­ten ge­bo­ren +Alle men­sen wor­den vrij en ge­lijk in waar­dig­heid en rech­ten ge­bo­ren
diff --git a/layout/reftests/text/auto-hyphenation-pl-1-ref.html b/layout/reftests/text/auto-hyphenation-pl-1-ref.html --- a/layout/reftests/text/auto-hyphenation-pl-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-pl-1-ref.html @@ -6,18 +6,17 @@ -
Uni- -kod +
Unikod przy- pi- su- je uni- kal- ny nu- diff --git a/layout/reftests/text/auto-hyphenation-pt-1-ref.html b/layout/reftests/text/auto-hyphenation-pt-1-ref.html --- a/layout/reftests/text/auto-hyphenation-pt-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-pt-1-ref.html @@ -1,11 +1,11 @@
-To­dos os se­res hu­ma­nos nas­cem li­vres e iguais em dig­ni­da­de e em di­rei­tos +Todos os se­res hu­ma­nos nas­cem li­vres e iguais em dig­ni­da­de e em di­rei­tos
diff --git a/layout/reftests/text/auto-hyphenation-sv-1-ref.html b/layout/reftests/text/auto-hyphenation-sv-1-ref.html --- a/layout/reftests/text/auto-hyphenation-sv-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-sv-1-ref.html @@ -1,11 +1,11 @@
-Al­la män­ni­skor äro föd­da fria och li­ka i vär­de och rät­tig­he­ter +Alla män­ni­skor äro föd­da fria och li­ka i vär­de och rät­tig­he­ter
diff --git a/layout/reftests/text/auto-hyphenation-tr-1-ref.html b/layout/reftests/text/auto-hyphenation-tr-1-ref.html --- a/layout/reftests/text/auto-hyphenation-tr-1-ref.html +++ b/layout/reftests/text/auto-hyphenation-tr-1-ref.html @@ -1,11 +1,11 @@
-Bü­tün in­san­lar hür, hay­si­yet ve hak­lar ba­kı­mın­dan eşit do­ğar­lar. +Bütün in­san­lar hür, hay­si­yet ve hak­lar ba­kı­mın­dan eşit do­ğar­lar.
diff --git a/layout/reftests/text/hyphenation-control-3-ref.html b/layout/reftests/text/hyphenation-control-3-ref.html --- a/layout/reftests/text/hyphenation-control-3-ref.html +++ b/layout/reftests/text/hyphenation-control-3-ref.html @@ -6,12 +6,12 @@ code { display:block; hyphens:manual; border: 1px solid black; } -ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTU-
VWXYZsupercalifragilisticexpialidocious-
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ +abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstu-
vwxyzsupercalifragilisticexpialidocious-
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
diff --git a/layout/reftests/text/hyphenation-control-3.html b/layout/reftests/text/hyphenation-control-3.html --- a/layout/reftests/text/hyphenation-control-3.html +++ b/layout/reftests/text/hyphenation-control-3.html @@ -10,12 +10,12 @@ code { -ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZsuper­cali­fragi­listic­expiali­docious­ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ +abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzsuper­cali­fragi­listic­expiali­docious­abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz diff --git a/modules/libpref/init/all.js b/modules/libpref/init/all.js --- a/modules/libpref/init/all.js +++ b/modules/libpref/init/all.js @@ -2480,16 +2480,21 @@ pref("intl.hyphenation-alias.bs-*", "sh" // Norwegian has two forms, Bokmål and Nynorsk, with "no" as a macrolanguage encompassing both. // For "no", we'll alias to "nb" (Bokmål) as that is the more widely used written form. pref("intl.hyphenation-alias.no", "nb"); pref("intl.hyphenation-alias.no-*", "nb"); pref("intl.hyphenation-alias.nb-*", "nb"); pref("intl.hyphenation-alias.nn-*", "nn"); +// In German, we allow hyphenation of capitalized words; otherwise not. +pref("intl.hyphenate-capitalized.de-1996", true); +pref("intl.hyphenate-capitalized.de-1901", true); +pref("intl.hyphenate-capitalized.de-CH", true); + // All prefs of default font should be "auto". pref("font.name.serif.ar", ""); pref("font.name.sans-serif.ar", ""); pref("font.name.monospace.ar", ""); pref("font.name.cursive.ar", ""); pref("font.name.serif.el", ""); pref("font.name.sans-serif.el", "");