1package uniseg
2
3import "unicode/utf8"
4
5// FirstLineSegment returns the prefix of the given byte slice after which a
6// decision to break the string over to the next line can or must be made,
7// according to the rules of [Unicode Standard Annex #14]. This is used to
8// implement line breaking.
9//
10// Line breaking, also known as word wrapping, is the process of breaking a
11// section of text into lines such that it will fit in the available width of a
12// page, window or other display area.
13//
14// The returned "segment" may not be broken into smaller parts, unless no other
15// breaking opportunities present themselves, in which case you may break by
16// grapheme clusters (using the [FirstGraphemeCluster] function to determine the
17// grapheme clusters).
18//
19// The "mustBreak" flag indicates whether you MUST break the line after the
20// given segment (true), for example after newline characters, or you MAY break
21// the line after the given segment (false).
22//
23// This function can be called continuously to extract all non-breaking sub-sets
24// from a byte slice, as illustrated in the example below.
25//
26// If you don't know the current state, for example when calling the function
27// for the first time, you must pass -1. For consecutive calls, pass the state
28// and rest slice returned by the previous call.
29//
30// The "rest" slice is the sub-slice of the original byte slice "b" starting
31// after the last byte of the identified line segment. If the length of the
32// "rest" slice is 0, the entire byte slice "b" has been processed. The
33// "segment" byte slice is the sub-slice of the input slice containing the
34// identified line segment.
35//
36// Given an empty byte slice "b", the function returns nil values.
37//
38// Note that in accordance with [UAX #14 LB3], the final segment will end with
39// "mustBreak" set to true. You can choose to ignore this by checking if the
40// length of the "rest" slice is 0 and calling [HasTrailingLineBreak] or
41// [HasTrailingLineBreakInString] on the last rune.
42//
43// Note also that this algorithm may break within grapheme clusters. This is
44// addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use
45// the [Step] function instead.
46//
47// [Unicode Standard Annex #14]: https://www.unicode.org/reports/tr14/
48// [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
49func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool, newState int) {
50 // An empty byte slice returns nothing.
51 if len(b) == 0 {
52 return
53 }
54
55 // Extract the first rune.
56 r, length := utf8.DecodeRune(b)
57 if len(b) <= length { // If we're already past the end, there is nothing else to parse.
58 return b, nil, true, lbAny // LB3.
59 }
60
61 // If we don't know the state, determine it now.
62 if state < 0 {
63 state, _ = transitionLineBreakState(state, r, b[length:], "")
64 }
65
66 // Transition until we find a boundary.
67 var boundary int
68 for {
69 r, l := utf8.DecodeRune(b[length:])
70 state, boundary = transitionLineBreakState(state, r, b[length+l:], "")
71
72 if boundary != LineDontBreak {
73 return b[:length], b[length:], boundary == LineMustBreak, state
74 }
75
76 length += l
77 if len(b) <= length {
78 return b, nil, true, lbAny // LB3
79 }
80 }
81}
82
83// FirstLineSegmentInString is like [FirstLineSegment] but its input and outputs
84// are strings.
85func FirstLineSegmentInString(str string, state int) (segment, rest string, mustBreak bool, newState int) {
86 // An empty byte slice returns nothing.
87 if len(str) == 0 {
88 return
89 }
90
91 // Extract the first rune.
92 r, length := utf8.DecodeRuneInString(str)
93 if len(str) <= length { // If we're already past the end, there is nothing else to parse.
94 return str, "", true, lbAny // LB3.
95 }
96
97 // If we don't know the state, determine it now.
98 if state < 0 {
99 state, _ = transitionLineBreakState(state, r, nil, str[length:])
100 }
101
102 // Transition until we find a boundary.
103 var boundary int
104 for {
105 r, l := utf8.DecodeRuneInString(str[length:])
106 state, boundary = transitionLineBreakState(state, r, nil, str[length+l:])
107
108 if boundary != LineDontBreak {
109 return str[:length], str[length:], boundary == LineMustBreak, state
110 }
111
112 length += l
113 if len(str) <= length {
114 return str, "", true, lbAny // LB3.
115 }
116 }
117}
118
119// HasTrailingLineBreak returns true if the last rune in the given byte slice is
120// one of the hard line break code points defined in LB4 and LB5 of [UAX #14].
121//
122// [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm
123func HasTrailingLineBreak(b []byte) bool {
124 r, _ := utf8.DecodeLastRune(b)
125 property, _ := propertyLineBreak(r)
126 return property == prBK || property == prCR || property == prLF || property == prNL
127}
128
129// HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string.
130func HasTrailingLineBreakInString(str string) bool {
131 r, _ := utf8.DecodeLastRuneInString(str)
132 property, _ := propertyLineBreak(r)
133 return property == prBK || property == prCR || property == prLF || property == prNL
134}