patches/brkitr.patch - third_party/icu - Git at Google

 Index: source/data/brkitr/word.txt
 ===================================================================
 --- source/data/brkitr/word.txt	(revision 259715)
 +++ source/data/brkitr/word.txt	(working copy)
 @@ -35,10 +35,16 @@
  $ALetter            = [\p{Word_Break = ALetter}];
  $Single_Quote       = [\p{Word_Break = Single_Quote}];
  $Double_Quote       = [\p{Word_Break = Double_Quote}];
 -$MidNumLet          = [\p{Word_Break = MidNumLet}];
 +# Remove two full stop characters from $MidNumLet and add them to $MidNum
 +# to break a hostname into its components at the cost of breaking
 +# 'e.g.' and 'i.e.' as well.
 +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
 +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
 +# while rules 6/7 are reverted to the old behavior we want.
 +$MidNumLet    = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
  $MidLetter          = [\p{Word_Break = MidLetter}];
 -$MidNum             = [\p{Word_Break = MidNum}];
 -$Numeric            = [\p{Word_Break = Numeric}];
 +$MidNum       = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
 +$Numeric      = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
  $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];

  $Han                = [:Han:];
 Index: source/data/brkitr/line.txt
 ===================================================================
 --- source/data/brkitr/line.txt	(revision 259715)
 +++ source/data/brkitr/line.txt	(working copy)
 @@ -12,9 +12,8 @@
  #         This is only because of a limitation of ICU break engine implementation,
  #         not because the older behavior is desirable.

 -#
 -#  Character Classes defined by TR 14.
 -#
 +# CHROME: 1. Use line_ja.txt to apply small kana rules in all locales.
 +#         2. Adjust CL, OP, and IS to handle 'comma-variants' consistently.

  !!chain;
  !!LBCMNoChain;
 @@ -57,14 +56,14 @@
  #

  $AI = [:LineBreak =  Ambiguous:];
 -$AL = [:LineBreak =  Alphabetic:];
 +$AL = [[:LineBreak =  Alphabetic:] - [\u23B4\u23B5]];
  $BA = [:LineBreak =  Break_After:];
  $BB = [:LineBreak =  Break_Before:];
  $BK = [:LineBreak =  Mandatory_Break:];
  $B2 = [:LineBreak =  Break_Both:];
  $CB = [:LineBreak =  Contingent_Break:];
  $CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 -$CL = [:LineBreak =  Close_Punctuation:];
 +$CL = [[:LineBreak =  Close_Punctuation:] [\uFE51\uFE10\u23B5]];
  $CM = [:LineBreak =  Combining_Mark:];
  $CP = [:LineBreak =  Close_Parenthesis:];
  $CR = [:LineBreak =  Carriage_Return:];
 @@ -74,16 +73,16 @@
  $HY = [:LineBreak =  Hyphen:];
  $H2 = [:LineBreak =  H2:];
  $H3 = [:LineBreak =  H3:];
 -$ID = [:LineBreak =  Ideographic:];
 +$ID = [[[:LineBreak =  Ideographic:] $CJ] - [\uFE51]];
  $IN = [:LineBreak =  Inseperable:];
 -$IS = [:LineBreak =  Infix_Numeric:];
 +$IS = [[:LineBreak =  Infix_Numeric:] - [\uFE10]];
  $JL = [:LineBreak =  JL:];
  $JV = [:LineBreak =  JV:];
  $JT = [:LineBreak =  JT:];
  $LF = [:LineBreak =  Line_Feed:];
  $NL = [:LineBreak =  Next_Line:];
 -$NS = [[:LineBreak =  Nonstarter:] $CJ];
 +$NS = [:LineBreak =  Nonstarter:];
  $NU = [:LineBreak =  Numeric:];
 -$OP = [:LineBreak =  Open_Punctuation:];
 +$OP = [[:LineBreak =  Open_Punctuation:] \u23B4];
  $PO = [:LineBreak =  Postfix_Numeric:];
  $PR = [:LineBreak =  Prefix_Numeric:];
	Index: source/data/brkitr/word.txt
	===================================================================
	--- source/data/brkitr/word.txt (revision 259715)
	+++ source/data/brkitr/word.txt (working copy)
	@@ -35,10 +35,16 @@
	$ALetter = [\p{Word_Break = ALetter}];
	$Single_Quote = [\p{Word_Break = Single_Quote}];
	$Double_Quote = [\p{Word_Break = Double_Quote}];
	-$MidNumLet = [\p{Word_Break = MidNumLet}];
	+# Remove two full stop characters from $MidNumLet and add them to $MidNum
	+# to break a hostname into its components at the cost of breaking
	+# 'e.g.' and 'i.e.' as well.
	+# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
	+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
	+# while rules 6/7 are reverted to the old behavior we want.
	+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
	$MidLetter = [\p{Word_Break = MidLetter}];
	-$MidNum = [\p{Word_Break = MidNum}];
	-$Numeric = [\p{Word_Break = Numeric}];
	+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
	+$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
	$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

	$Han = [:Han:];
	Index: source/data/brkitr/line.txt
	===================================================================
	--- source/data/brkitr/line.txt (revision 259715)
	+++ source/data/brkitr/line.txt (working copy)
	@@ -12,9 +12,8 @@
	# This is only because of a limitation of ICU break engine implementation,
	# not because the older behavior is desirable.

	-#
	-# Character Classes defined by TR 14.
	-#
	+# CHROME: 1. Use line_ja.txt to apply small kana rules in all locales.
	+# 2. Adjust CL, OP, and IS to handle 'comma-variants' consistently.

	!!chain;
	!!LBCMNoChain;
	@@ -57,14 +56,14 @@
	#

	$AI = [:LineBreak = Ambiguous:];
	-$AL = [:LineBreak = Alphabetic:];
	+$AL = [[:LineBreak = Alphabetic:] - [\u23B4\u23B5]];
	$BA = [:LineBreak = Break_After:];
	$BB = [:LineBreak = Break_Before:];
	$BK = [:LineBreak = Mandatory_Break:];
	$B2 = [:LineBreak = Break_Both:];
	$CB = [:LineBreak = Contingent_Break:];
	$CJ = [:LineBreak = Conditional_Japanese_Starter:];
	-$CL = [:LineBreak = Close_Punctuation:];
	+$CL = [[:LineBreak = Close_Punctuation:] [\uFE51\uFE10\u23B5]];
	$CM = [:LineBreak = Combining_Mark:];
	$CP = [:LineBreak = Close_Parenthesis:];
	$CR = [:LineBreak = Carriage_Return:];
	@@ -74,16 +73,16 @@
	$HY = [:LineBreak = Hyphen:];
	$H2 = [:LineBreak = H2:];
	$H3 = [:LineBreak = H3:];
	-$ID = [:LineBreak = Ideographic:];
	+$ID = [[[:LineBreak = Ideographic:] $CJ] - [\uFE51]];
	$IN = [:LineBreak = Inseperable:];
	-$IS = [:LineBreak = Infix_Numeric:];
	+$IS = [[:LineBreak = Infix_Numeric:] - [\uFE10]];
	$JL = [:LineBreak = JL:];
	$JV = [:LineBreak = JV:];
	$JT = [:LineBreak = JT:];
	$LF = [:LineBreak = Line_Feed:];
	$NL = [:LineBreak = Next_Line:];
	-$NS = [[:LineBreak = Nonstarter:] $CJ];
	+$NS = [:LineBreak = Nonstarter:];
	$NU = [:LineBreak = Numeric:];
	-$OP = [:LineBreak = Open_Punctuation:];
	+$OP = [[:LineBreak = Open_Punctuation:] \u23B4];
	$PO = [:LineBreak = Postfix_Numeric:];
	$PR = [:LineBreak = Prefix_Numeric:];