1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package cases
6
7// This file contains the definitions of case mappings for all supported
8// languages. The rules for the language-specific tailorings were taken and
9// modified from the CLDR transform definitions in common/transforms.
10
11import (
12 "strings"
13 "unicode"
14 "unicode/utf8"
15
16 "golang.org/x/text/internal"
17 "golang.org/x/text/language"
18 "golang.org/x/text/transform"
19 "golang.org/x/text/unicode/norm"
20)
21
// A mapFunc takes a context set to the current rune and writes the mapped
// version to the same context. It may advance the context to the next rune. It
// returns whether a checkpoint is possible: whether the pDst bytes written to
// dst so far won't need changing as we see more source bytes.
//
// Callers such as simpleCaser and lowerCaser leave their processing loop as
// soon as a mapFunc returns false.
type mapFunc func(*context) bool
27
// A spanFunc takes a context set to the current rune and reports whether the
// span may be extended past that rune. It may advance the context to the next
// rune. The Span loops in this file only checkpoint while it returns true and
// stop at the first false.
//
// NOTE(review): this comment used to say that a spanFunc "returns whether this
// rune would be altered", but the callers treat a true result as "keep
// spanning"; confirm the intended wording against isUpper/isLower/isTitle.
type spanFunc func(*context) bool
32
// maxIgnorable defines the maximum number of ignorables to consider for
// lookahead operations.
const maxIgnorable = 30

// supported lists the language tags for which we have tailorings. init splits
// this string on spaces and builds the matcher from the tags in this order.
// The index returned by the matcher is used to index upperFunc, lowerFunc and
// titleInfos, whose entries are annotated with the same tags in the same order
// (presumably the index is the position in this list; verify against the
// matcher), so this list and those tables must be kept in sync.
const supported = "und af az el lt nl tr"
39
40func init() {
41 tags := []language.Tag{}
42 for _, s := range strings.Split(supported, " ") {
43 tags = append(tags, language.MustParse(s))
44 }
45 matcher = internal.NewInheritanceMatcher(tags)
46 Supported = language.NewCoverage(tags)
47}
48
var (
	// matcher maps a language tag to an index into the per-language tables
	// below. It is assigned in init.
	matcher *internal.InheritanceMatcher

	// Supported is the coverage built in init from the tags listed in
	// supported.
	Supported language.Coverage

	// We keep the following lists separate, instead of having a single per-
	// language struct, to give the compiler a chance to remove unused code.

	// Some uppercase mappers are stateless, so we can precompute the
	// Transformers and save a bit on runtime allocations.

	// upperFunc holds, per language, the rune mapper and the span function
	// used for upper casing. A nil upper entry makes makeUpper return the
	// shared undUpper transformer instead of allocating a simpleCaser.
	upperFunc = []struct {
		upper mapFunc
		span  spanFunc
	}{
		{nil, nil},                  // und
		{nil, nil},                  // af
		{aztrUpper(upper), isUpper}, // az
		{elUpper, noSpan},           // el
		{ltUpper(upper), noSpan},    // lt
		{nil, nil},                  // nl
		{aztrUpper(upper), isUpper}, // tr
	}

	// Shared, allocation-free transformers for the root locale. They are
	// returned by makeUpper and makeLower for languages without a tailoring.
	undUpper            transform.SpanningTransformer = &undUpperCaser{}
	undLower            transform.SpanningTransformer = &undLowerCaser{}
	undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}

	// lowerFunc holds, per language, the tailored lower-casing rune mapper. A
	// nil entry makes makeLower fall back to one of the und transformers.
	lowerFunc = []mapFunc{
		nil,       // und
		nil,       // af
		aztrLower, // az
		nil,       // el
		ltLower,   // lt
		nil,       // nl
		aztrLower, // tr
	}

	// titleInfos holds, per language, the pieces makeTitle assembles into a
	// titleCaser: the mapper for the first cased rune of a word (title), the
	// mapper for the remaining runes (lower), the span function for the first
	// cased rune (titleSpan) and an optional hook that is run on every rune
	// before it is mapped (rewrite).
	titleInfos = []struct {
		title     mapFunc
		lower     mapFunc
		titleSpan spanFunc
		rewrite   func(*context)
	}{
		{title, lower, isTitle, nil},                // und
		{title, lower, isTitle, afnlRewrite},        // af
		{aztrUpper(title), aztrLower, isTitle, nil}, // az
		{title, lower, isTitle, nil},                // el
		{ltUpper(title), ltLower, noSpan, nil},      // lt
		{nlTitle, lower, nlTitleSpan, afnlRewrite},  // nl
		{aztrUpper(title), aztrLower, isTitle, nil}, // tr
	}
)
101
102func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
103 _, i, _ := matcher.Match(t)
104 f := upperFunc[i].upper
105 if f == nil {
106 return undUpper
107 }
108 return &simpleCaser{f: f, span: upperFunc[i].span}
109}
110
111func makeLower(t language.Tag, o options) transform.SpanningTransformer {
112 _, i, _ := matcher.Match(t)
113 f := lowerFunc[i]
114 if f == nil {
115 if o.ignoreFinalSigma {
116 return undLowerIgnoreSigma
117 }
118 return undLower
119 }
120 if o.ignoreFinalSigma {
121 return &simpleCaser{f: f, span: isLower}
122 }
123 return &lowerCaser{
124 first: f,
125 midWord: finalSigma(f),
126 }
127}
128
129func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
130 _, i, _ := matcher.Match(t)
131 x := &titleInfos[i]
132 lower := x.lower
133 if o.noLower {
134 lower = (*context).copy
135 } else if !o.ignoreFinalSigma {
136 lower = finalSigma(lower)
137 }
138 return &titleCaser{
139 title: x.title,
140 lower: lower,
141 titleSpan: x.titleSpan,
142 rewrite: x.rewrite,
143 }
144}
145
// noSpan is the spanFunc for casers that have no real span implementation. It
// records transform.ErrEndOfSpan on the context and returns false, so a Span
// loop using it stops at the first rune.
func noSpan(c *context) bool {
	c.err = transform.ErrEndOfSpan
	return false
}
150
151// TODO: consider a similar special case for the fast majority lower case. This
152// is a bit more involved so will require some more precise benchmarking to
153// justify it.
154
155type undUpperCaser struct{ transform.NopResetter }
156
157// undUpperCaser implements the Transformer interface for doing an upper case
158// mapping for the root locale (und). It eliminates the need for an allocation
159// as it prevents escaping by not using function pointers.
160func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
161 c := context{dst: dst, src: src, atEOF: atEOF}
162 for c.next() {
163 upper(&c)
164 c.checkpoint()
165 }
166 return c.ret()
167}
168
169func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
170 c := context{src: src, atEOF: atEOF}
171 for c.next() && isUpper(&c) {
172 c.checkpoint()
173 }
174 return c.retSpan()
175}
176
177// undLowerIgnoreSigmaCaser implements the Transformer interface for doing
178// a lower case mapping for the root locale (und) ignoring final sigma
179// handling. This casing algorithm is used in some performance-critical packages
180// like secure/precis and x/net/http/idna, which warrants its special-casing.
181type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
182
183func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
184 c := context{dst: dst, src: src, atEOF: atEOF}
185 for c.next() && lower(&c) {
186 c.checkpoint()
187 }
188 return c.ret()
189
190}
191
192// Span implements a generic lower-casing. This is possible as isLower works
193// for all lowercasing variants. All lowercase variants only vary in how they
194// transform a non-lowercase letter. They will never change an already lowercase
195// letter. In addition, there is no state.
196func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
197 c := context{src: src, atEOF: atEOF}
198 for c.next() && isLower(&c) {
199 c.checkpoint()
200 }
201 return c.retSpan()
202}
203
// simpleCaser applies a single mapFunc (f) to every rune when transforming and
// a single spanFunc (span) to every rune when spanning.
//
// NOTE(review): the Transform and Span methods in this file build a local
// context and do not touch the embedded one; the embedding is presumably there
// for methods it promotes (e.g. Reset) — confirm before removing it.
type simpleCaser struct {
	context
	f    mapFunc
	span spanFunc
}
209
210// simpleCaser implements the Transformer interface for doing a case operation
211// on a rune-by-rune basis.
212func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
213 c := context{dst: dst, src: src, atEOF: atEOF}
214 for c.next() && t.f(&c) {
215 c.checkpoint()
216 }
217 return c.ret()
218}
219
220func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
221 c := context{src: src, atEOF: atEOF}
222 for c.next() && t.span(&c) {
223 c.checkpoint()
224 }
225 return c.retSpan()
226}
227
228// undLowerCaser implements the Transformer interface for doing a lower case
229// mapping for the root locale (und) ignoring final sigma handling. This casing
230// algorithm is used in some performance-critical packages like secure/precis
231// and x/net/http/idna, which warrants its special-casing.
232type undLowerCaser struct{ transform.NopResetter }
233
234func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
235 c := context{dst: dst, src: src, atEOF: atEOF}
236
237 for isInterWord := true; c.next(); {
238 if isInterWord {
239 if c.info.isCased() {
240 if !lower(&c) {
241 break
242 }
243 isInterWord = false
244 } else if !c.copy() {
245 break
246 }
247 } else {
248 if c.info.isNotCasedAndNotCaseIgnorable() {
249 if !c.copy() {
250 break
251 }
252 isInterWord = true
253 } else if !c.hasPrefix("Ī£") {
254 if !lower(&c) {
255 break
256 }
257 } else if !finalSigmaBody(&c) {
258 break
259 }
260 }
261 c.checkpoint()
262 }
263 return c.ret()
264}
265
266func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
267 c := context{src: src, atEOF: atEOF}
268 for c.next() && isLower(&c) {
269 c.checkpoint()
270 }
271 return c.retSpan()
272}
273
// lowerCaser implements the Transformer interface. The default Unicode lower
// casing requires different treatment for the first and subsequent characters
// of a word, most notably to handle the Greek final Sigma.
//
// Only Transform is defined on lowerCaser in this file; Span is promoted from
// the embedded undLowerIgnoreSigmaCaser.
type lowerCaser struct {
	undLowerIgnoreSigmaCaser

	// context is reinitialized at the start of every Transform call.
	context

	// first maps the first cased rune of a word; midWord maps the runes that
	// follow it within the same word.
	first, midWord mapFunc
}
284
285func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
286 t.context = context{dst: dst, src: src, atEOF: atEOF}
287 c := &t.context
288
289 for isInterWord := true; c.next(); {
290 if isInterWord {
291 if c.info.isCased() {
292 if !t.first(c) {
293 break
294 }
295 isInterWord = false
296 } else if !c.copy() {
297 break
298 }
299 } else {
300 if c.info.isNotCasedAndNotCaseIgnorable() {
301 if !c.copy() {
302 break
303 }
304 isInterWord = true
305 } else if !t.midWord(c) {
306 break
307 }
308 }
309 c.checkpoint()
310 }
311 return c.ret()
312}
313
// titleCaser implements the Transformer interface. Title casing algorithms
// distinguish between the first letter of a word and subsequent letters of the
// same word. It uses state to avoid requiring a potentially infinite lookahead.
type titleCaser struct {
	// context is rebuilt on every Transform and Span call; only its isMidWord
	// flag is carried over from the previous call.
	context

	// rune mappings used by the actual casing algorithms.
	title     mapFunc  // applied to the first cased rune of a word
	lower     mapFunc  // applied to cased runes while isMidWord is set
	titleSpan spanFunc // span counterpart of title

	// rewrite, if non-nil, is called on the context for every rune before the
	// rune is mapped.
	rewrite func(*context)
}
327
// Transform implements the standard Unicode title case algorithm as defined in
// Chapter 3 of The Unicode Standard:
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
// first cased character F following the word boundary. If F exists, map F to
// Titlecase_Mapping(F); then map all characters C between F and the following
// word boundary to Lowercase_Mapping(C).
//
// The isMidWord flag survives between calls: each call seeds its fresh context
// with the value left in t.context by the previous call.
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	// Nothing could be read: return whatever the context recorded.
	if !c.next() {
		return c.ret()
	}

	for {
		// Snapshot the info of the current rune; the mapping functions below
		// may advance the context to a later rune.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				// First cased rune of a word.
				if !t.title(c) {
					break
				}
				c.isMidWord = true
			} else if !t.lower(c) {
				break
			}
		} else if !c.copy() {
			break
		} else if p.isBreak() {
			c.isMidWord = false
		}

		// As we save the state of the transformer, it is safe to call
		// checkpoint after any successful write.
		// The one exception is a mid rune seen while inside a word: whether
		// the word continues is only decided once the next rune is known.
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two mid runes in a row end the word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.ret()
}
382
// Span mirrors the control flow of Transform without writing any output: the
// first cased rune of a word is checked with t.titleSpan and cased runes inside
// a word are checked with isLower. The loop stops at the first rune that fails
// its check. As in Transform, isMidWord is carried over from the previous call.
func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
	t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	// Nothing could be read: return whatever the context recorded.
	if !c.next() {
		return c.retSpan()
	}

	for {
		// Snapshot the info of the current rune; the span functions below may
		// advance the context to a later rune.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				// First cased rune of a word.
				if !t.titleSpan(c) {
					break
				}
				c.isMidWord = true
			} else if !isLower(c) {
				break
			}
		} else if p.isBreak() {
			c.isMidWord = false
		}
		// As we save the state of the transformer, it is safe to call
		// checkpoint after any rune that passed its check.
		// The one exception is a mid rune seen while inside a word: whether
		// the word continues is only decided once the next rune is known.
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two mid runes in a row end the word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.retSpan()
}
427
428// finalSigma adds Greek final Sigma handing to another casing function. It
429// determines whether a lowercased sigma should be Ļ or Ļ, by looking ahead for
430// case-ignorables and a cased letters.
431func finalSigma(f mapFunc) mapFunc {
432 return func(c *context) bool {
433 if !c.hasPrefix("Ī£") {
434 return f(c)
435 }
436 return finalSigmaBody(c)
437 }
438}
439
440func finalSigmaBody(c *context) bool {
441 // Current rune must be ā.
442
443 // ::NFD();
444 // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
445 // Ī£ } [:case-ignorable:]* [:cased:] ā Ļ;
446 // [:cased:] [:case-ignorable:]* { Ī£ ā Ļ;
447 // ::Any-Lower;
448 // ::NFC();
449
450 p := c.pDst
451 c.writeString("Ļ")
452
453 // TODO: we should do this here, but right now this will never have an
454 // effect as this is called when the prefix is Sigma, whereas Dutch and
455 // Afrikaans only test for an apostrophe.
456 //
457 // if t.rewrite != nil {
458 // t.rewrite(c)
459 // }
460
461 // We need to do one more iteration after maxIgnorable, as a cased
462 // letter is not an ignorable and may modify the result.
463 wasMid := false
464 for i := 0; i < maxIgnorable+1; i++ {
465 if !c.next() {
466 return false
467 }
468 if !c.info.isCaseIgnorable() {
469 // All Midword runes are also case ignorable, so we are
470 // guaranteed to have a letter or word break here. As we are
471 // unreading the run, there is no need to unset c.isMidWord;
472 // the title caser will handle this.
473 if c.info.isCased() {
474 // p+1 is guaranteed to be in bounds: if writing Ļ was
475 // successful, p+1 will contain the second byte of Ļ. If not,
476 // this function will have returned after c.next returned false.
477 c.dst[p+1]++ // Ļ ā Ļ
478 }
479 c.unreadRune()
480 return true
481 }
482 // A case ignorable may also introduce a word break, so we may need
483 // to continue searching even after detecting a break.
484 isMid := c.info.isMid()
485 if (wasMid && isMid) || c.info.isBreak() {
486 c.isMidWord = false
487 }
488 wasMid = isMid
489 c.copy()
490 }
491 return true
492}
493
494// finalSigmaSpan would be the same as isLower.
495
// elUpper implements Greek upper casing, which entails removing a predefined
// set of non-blocked modifiers. Note that these accents should not be removed
// for title casing!
// Example: "Οδός" -> "ΟΔΟΣ".
//
// It returns false if the initial upper mapping fails or if the modifier scan
// runs out of input before it reaches a starter, an Above modifier it keeps,
// or the maxIgnorable limit.
func elUpper(c *context) bool {
	// From CLDR:
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;

	// Remember the source rune and the output position before upper runs, so
	// the script can be tested and the output rewritten afterwards.
	r, _ := utf8.DecodeRune(c.src[c.pSrc:])
	oldPDst := c.pDst
	if !upper(c) {
		return false
	}
	// Only Greek letters get their modifiers stripped.
	if !unicode.Is(unicode.Greek, r) {
		return true
	}
	i := 0
	// Take the properties of the uppercased rune that is already written to the
	// destination. This saves us the trouble of having to uppercase the
	// decomposed rune again.
	if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
		// Restore the destination position and process the decomposed rune.
		r, sz := utf8.DecodeRune(b)
		if r <= 0xFF { // See A.6.1
			return true
		}
		c.pDst = oldPDst
		// Insert the first rune and ignore the modifiers. See A.6.2.
		c.writeBytes(b[:sz])
		// Count the dropped modifiers against the maxIgnorable budget.
		i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
		// Above and Iota Subscript
		case 0x0300, // U+0300 COMBINING GRAVE ACCENT
			0x0301, // U+0301 COMBINING ACUTE ACCENT
			0x0304, // U+0304 COMBINING MACRON
			0x0306, // U+0306 COMBINING BREVE
			0x0308, // U+0308 COMBINING DIAERESIS
			0x0313, // U+0313 COMBINING COMMA ABOVE
			0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
			0x0342, // U+0342 COMBINING GREEK PERISPOMENI
			0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
			// No-op. Gobble the modifier.

		default:
			switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
			case cccZero:
				// A starter ends the modifier sequence; leave it for the
				// caller.
				c.unreadRune()
				return true

			// We don't need to test for IotaSubscript as the only rune that
			// qualifies (U+0345) was already excluded in the switch statement
			// above. See A.4.

			case cccAbove:
				// Any other Above modifier is kept and ends the scan.
				return c.copy()
			default:
				// Some other modifier. We're still allowed to gobble Greek
				// modifiers after this.
				c.copy()
			}
		}
	}
	// The loop ended either because the lookahead budget was used up (true) or
	// because c.next failed (false).
	return i == maxIgnorable
}
564
565// TODO: implement elUpperSpan (low-priority: complex and infrequent).
566
// ltLower implements Lithuanian lower casing: when a capital I or J (or a
// precomposed rune whose decomposition starts with one) is followed by a
// modifier with combining class Above, an explicit U+0307 COMBINING DOT ABOVE
// is inserted after the lowercased letter. All other runes are mapped with
// lower.
func ltLower(c *context) bool {
	// From CLDR:
	// # Introduce an explicit dot above when lowercasing capital I's and J's
	// # whenever there are more accents above.
	// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
	// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
	// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
	// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
	// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
	// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
	// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
	// ::NFD();
	// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
	// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
	// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
	// I \u0300 (Ì) → i \u0307 \u0300;
	// I \u0301 (Í) → i \u0307 \u0301;
	// I \u0303 (Ĩ) → i \u0307 \u0303;
	// ::Any-Lower();
	// ::NFC();

	i := 0
	if r := c.src[c.pSrc]; r < utf8.RuneSelf {
		// ASCII: lowercase first; only I and J need the lookahead below.
		lower(c)
		if r != 'I' && r != 'J' {
			return true
		}
	} else {
		p := norm.NFD.Properties(c.src[c.pSrc:])
		if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
			// UTF-8 optimization: the decomposition will only have an above
			// modifier if the last rune of the decomposition is in [U+300-U+311].
			// In all other cases, a decomposition starting with I is always
			// an I followed by modifiers that are not cased themselves. See A.2.
			if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
				if !c.writeBytes(d[:1]) {
					return false
				}
				// Lowercase the ASCII base letter that was just written.
				c.dst[c.pDst-1] += 'a' - 'A' // lower

				// Assumption: modifier never changes on lowercase. See A.1.
				// Assumption: all modifiers added have CCC = Above. See A.2.3.
				return c.writeString("\u0307") && c.writeBytes(d[1:])
			}
			// In all other cases the additional modifiers will have a CCC
			// that is less than 230 (Above). We will insert the U+0307, if
			// needed, after these modifiers so that a string in FCD form
			// will remain so. See A.2.2.
			lower(c)
			// The modifier inside the decomposition counts against the
			// lookahead budget.
			i = 1
		} else {
			return lower(c)
		}
	}

	// Scan the modifiers that follow the letter.
	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccZero:
			// A starter: no Above modifier follows, so no dot is needed.
			c.unreadRune()
			return true
		case cccAbove:
			// Insert the explicit dot before the first Above modifier.
			return c.writeString("\u0307") && c.copy() // See A.1.
		default:
			c.copy() // See A.1.
		}
	}
	// The loop ended either because the lookahead budget was used up (true) or
	// because c.next failed (false).
	return i == maxIgnorable
}
635
636// ltLowerSpan would be the same as isLower.
637
// ltUpper wraps the casing function f with the Lithuanian rule that removes a
// U+0307 COMBINING DOT ABOVE following a soft-dotted rune. The returned mapFunc
// first applies f and only inspects the following modifiers when the source
// rune is soft-dotted.
func ltUpper(f mapFunc) mapFunc {
	return func(c *context) bool {
		// Unicode:
		// 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
		//
		// From CLDR:
		// # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
		// # intervening non-230 marks.
		// ::NFD();
		// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
		// ::Any-Upper();
		// ::NFC();

		// TODO: See A.5. A soft-dotted rune never has an exception. This would
		// allow us to overload the exception bit and encode this property in
		// info. Need to measure performance impact of this.
		r, _ := utf8.DecodeRune(c.src[c.pSrc:])
		// Remember where the mapped rune starts so it can be replaced by a
		// precomposed form below.
		oldPDst := c.pDst
		if !f(c) {
			return false
		}
		if !unicode.Is(unicode.Soft_Dotted, r) {
			return true
		}

		// We don't need to do an NFD normalization, as a soft-dotted rune never
		// contains U+0307. See A.3.

		i := 0
		for ; i < maxIgnorable && c.next(); i++ {
			switch c.info.cccType() {
			case cccZero:
				// A starter ends the modifier sequence; leave it for the
				// caller.
				c.unreadRune()
				return true
			case cccAbove:
				if c.hasPrefix("\u0307") {
					// We don't do a full NFC, but rather combine runes for
					// some of the common cases. (Returning NFC or
					// preserving normal form is neither a requirement nor
					// a possibility anyway).
					// Advancing here without copying is what drops the
					// U+0307.
					if !c.next() {
						return false
					}
					// Only combine when the output so far is exactly a single
					// 'I' and the next rune starts with byte 0xCC.
					if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
						s := ""
						switch c.src[c.pSrc+1] {
						case 0x80: // U+0300 COMBINING GRAVE ACCENT
							s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
						case 0x81: // U+0301 COMBINING ACUTE ACCENT
							s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
						case 0x83: // U+0303 COMBINING TILDE
							s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
						case 0x88: // U+0308 COMBINING DIAERESIS
							s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
						default:
						}
						if s != "" {
							// Overwrite the 'I' with the precomposed rune.
							c.pDst = oldPDst
							return c.writeString(s)
						}
					}
				}
				return c.copy()
			default:
				c.copy()
			}
		}
		// The loop ended either because the lookahead budget was used up
		// (true) or because c.next failed (false).
		return i == maxIgnorable
	}
}
708
709// TODO: implement ltUpperSpan (low priority: complex and infrequent).
710
711func aztrUpper(f mapFunc) mapFunc {
712 return func(c *context) bool {
713 // iāİ;
714 if c.src[c.pSrc] == 'i' {
715 return c.writeString("İ")
716 }
717 return f(c)
718 }
719}
720
// aztrLower implements Azeri/Turkish lower casing. U+0130 is mapped to 'i'. An
// ASCII 'I' becomes 'i' when a U+0307 COMBINING DOT ABOVE follows it (the dot
// itself is dropped) and a dotless i otherwise. All other runes are mapped with
// lower.
func aztrLower(c *context) (done bool) {
	// From CLDR:
	// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
	// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	// İ→i;
	// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
	// # This matches the behavior of the canonically equivalent I-dot_above
	// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
	// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
	// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
	// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
	// I→ı ;
	// ::Any-Lower();
	if c.hasPrefix("\u0130") { // İ
		return c.writeString("i")
	}
	if c.src[c.pSrc] != 'I' {
		return lower(c)
	}

	// We ignore the lower-case I for now, but insert it later when we know
	// which form we need.
	// start is the source offset just past the 'I'; the skipped modifiers are
	// written from here once the form is known.
	start := c.pSrc + c.sz

	i := 0
Loop:
	// We check for up to n ignorables before \u0307. As \u0307 is an
	// ignorable as well, n is maxIgnorable-1.
	// NOTE(review): the loop bound below is maxIgnorable, not maxIgnorable-1;
	// confirm which of the two the comment above intends.
	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccAbove:
			if c.hasPrefix("\u0307") {
				// Dotted form: write 'i' and the modifiers in between, but
				// not the dot itself.
				return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
			}
			// Some other Above modifier: no dot can follow any more.
			done = true
			break Loop
		case cccZero:
			// A starter ends the modifier sequence; leave it for the caller.
			c.unreadRune()
			done = true
			break Loop
		default:
			// We'll write this rune after we know which starter to use.
		}
	}
	if i == maxIgnorable {
		done = true
	}
	// Dotless form, followed by everything that was skipped. done is still
	// false when the loop ended because c.next failed.
	return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
}
770
771// aztrLowerSpan would be the same as isLower.
772
773func nlTitle(c *context) bool {
774 // From CLDR:
775 // # Special titlecasing for Dutch initial "ij".
776 // ::Any-Title();
777 // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
778 // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } ā IJ ;
779 if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
780 return title(c)
781 }
782
783 if !c.writeString("I") || !c.next() {
784 return false
785 }
786 if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
787 return c.writeString("J")
788 }
789 c.unreadRune()
790 return true
791}
792
793func nlTitleSpan(c *context) bool {
794 // From CLDR:
795 // # Special titlecasing for Dutch initial "ij".
796 // ::Any-Title();
797 // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
798 // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } ā IJ ;
799 if c.src[c.pSrc] != 'I' {
800 return isTitle(c)
801 }
802 if !c.next() || c.src[c.pSrc] == 'j' {
803 return false
804 }
805 if c.src[c.pSrc] != 'J' {
806 c.unreadRune()
807 }
808 return true
809}
810
811// Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.
812func afnlRewrite(c *context) {
813 if c.hasPrefix("'") || c.hasPrefix("ā") {
814 c.isMidWord = true
815 }
816}