1package uniseg
2
3import "unicode/utf8"
4
5// FirstSentence returns the first sentence found in the given byte slice
6// according to the rules of [Unicode Standard Annex #29, Sentence Boundaries].
7// This function can be called continuously to extract all sentences from a byte
8// slice, as illustrated in the example below.
9//
10// If you don't know the current state, for example when calling the function
11// for the first time, you must pass -1. For consecutive calls, pass the state
12// and rest slice returned by the previous call.
13//
14// The "rest" slice is the sub-slice of the original byte slice "b" starting
15// after the last byte of the identified sentence. If the length of the "rest"
16// slice is 0, the entire byte slice "b" has been processed. The "sentence" byte
17// slice is the sub-slice of the input slice containing the identified sentence.
18//
19// Given an empty byte slice "b", the function returns nil values.
20//
21// [Unicode Standard Annex #29, Sentence Boundaries]: http://unicode.org/reports/tr29/#Sentence_Boundaries
22func FirstSentence(b []byte, state int) (sentence, rest []byte, newState int) {
23 // An empty byte slice returns nothing.
24 if len(b) == 0 {
25 return
26 }
27
28 // Extract the first rune.
29 r, length := utf8.DecodeRune(b)
30 if len(b) <= length { // If we're already past the end, there is nothing else to parse.
31 return b, nil, sbAny
32 }
33
34 // If we don't know the state, determine it now.
35 if state < 0 {
36 state, _ = transitionSentenceBreakState(state, r, b[length:], "")
37 }
38
39 // Transition until we find a boundary.
40 var boundary bool
41 for {
42 r, l := utf8.DecodeRune(b[length:])
43 state, boundary = transitionSentenceBreakState(state, r, b[length+l:], "")
44
45 if boundary {
46 return b[:length], b[length:], state
47 }
48
49 length += l
50 if len(b) <= length {
51 return b, nil, sbAny
52 }
53 }
54}
55
56// FirstSentenceInString is like [FirstSentence] but its input and outputs are
57// strings.
58func FirstSentenceInString(str string, state int) (sentence, rest string, newState int) {
59 // An empty byte slice returns nothing.
60 if len(str) == 0 {
61 return
62 }
63
64 // Extract the first rune.
65 r, length := utf8.DecodeRuneInString(str)
66 if len(str) <= length { // If we're already past the end, there is nothing else to parse.
67 return str, "", sbAny
68 }
69
70 // If we don't know the state, determine it now.
71 if state < 0 {
72 state, _ = transitionSentenceBreakState(state, r, nil, str[length:])
73 }
74
75 // Transition until we find a boundary.
76 var boundary bool
77 for {
78 r, l := utf8.DecodeRuneInString(str[length:])
79 state, boundary = transitionSentenceBreakState(state, r, nil, str[length+l:])
80
81 if boundary {
82 return str[:length], str[length:], state
83 }
84
85 length += l
86 if len(str) <= length {
87 return str, "", sbAny
88 }
89 }
90}