Emoticons.java

  1/*
  2 * Copyright (c) 2017, Daniel Gultsch All rights reserved.
  3 *
  4 * Redistribution and use in source and binary forms, with or without modification,
  5 * are permitted provided that the following conditions are met:
  6 *
  7 * 1. Redistributions of source code must retain the above copyright notice, this
  8 * list of conditions and the following disclaimer.
  9 *
 10 * 2. Redistributions in binary form must reproduce the above copyright notice,
 11 * this list of conditions and the following disclaimer in the documentation and/or
 12 * other materials provided with the distribution.
 13 *
 14 * 3. Neither the name of the copyright holder nor the names of its contributors
 15 * may be used to endorse or promote products derived from this software without
 16 * specific prior written permission.
 17 *
 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 21 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 */
 29
 30package eu.siacs.conversations.utils;
 31
 32import java.util.ArrayList;
 33import java.util.Arrays;
 34import java.util.List;
 35import java.util.regex.Pattern;
 36
 37public class Emoticons {
 38
 39	private static final UnicodeRange MISC_SYMBOLS_AND_PICTOGRAPHS = new UnicodeRange(0x1F300,0x1F5FF);
 40	private static final UnicodeRange SUPPLEMENTAL_SYMBOLS = new UnicodeRange(0x1F900,0x1F9FF);
 41	private static final UnicodeRange EMOTICONS = new UnicodeRange(0x1F600,0x1F64F);
 42	private static final UnicodeRange TRANSPORT_SYMBOLS = new UnicodeRange(0x1F680,0x1F6FF);
 43	private static final UnicodeRange MISC_SYMBOLS = new UnicodeRange(0x2600,0x26FF);
 44	private static final UnicodeRange DINGBATS = new UnicodeRange(0x2700,0x27BF);
 45	private static final UnicodeRange ENCLOSED_ALPHANUMERIC_SUPPLEMENT = new UnicodeRange(0x1F100,0x1F1FF);
 46	private static final UnicodeRange ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = new UnicodeRange(0x1F200,0x1F2FF);
 47	private static final UnicodeRange REGIONAL_INDICATORS = new UnicodeRange(0x1F1E6,0x1F1FF);
 48	private static final UnicodeRange GEOMETRIC_SHAPES = new UnicodeRange(0x25A0,0x25FF);
 49	private static final UnicodeRange LATIN_SUPPLEMENT = new UnicodeRange(0x80,0xFF);
 50	private static final UnicodeRange MISC_TECHNICAL = new UnicodeRange(0x2300,0x23FF);
 51	private static final UnicodeRange TAGS = new UnicodeRange(0xE0020,0xE007F);
 52	private static final UnicodeList CYK_SYMBOLS_AND_PUNCTUATION = new UnicodeList(0x3030,0x303D);
 53	private static final UnicodeList LETTERLIKE_SYMBOLS = new UnicodeList(0x2122,0x2139);
 54
 55	private static final UnicodeBlocks KEYCAP_COMBINEABLE = new UnicodeBlocks(new UnicodeList(0x23),new UnicodeList(0x2A),new UnicodeRange(0x30,0x39));
 56
 57	private static final UnicodeBlocks SYMBOLIZE = new UnicodeBlocks(
 58			GEOMETRIC_SHAPES,
 59			LATIN_SUPPLEMENT,
 60			CYK_SYMBOLS_AND_PUNCTUATION,
 61			LETTERLIKE_SYMBOLS,
 62			KEYCAP_COMBINEABLE);
 63	private static final UnicodeBlocks EMOJIS = new UnicodeBlocks(
 64			MISC_SYMBOLS_AND_PICTOGRAPHS,
 65			SUPPLEMENTAL_SYMBOLS,
 66			EMOTICONS,
 67			TRANSPORT_SYMBOLS,
 68			MISC_SYMBOLS,
 69			DINGBATS,
 70			ENCLOSED_ALPHANUMERIC_SUPPLEMENT,
 71			ENCLOSED_IDEOGRAPHIC_SUPPLEMENT,
 72			MISC_TECHNICAL);
 73
 74	private static final int ZWJ = 0x200D;
 75	private static final int VARIATION_16 = 0xFE0F;
 76	private static final int COMBINING_ENCLOSING_KEYCAP = 0x20E3;
 77	private static final int BLACK_FLAG = 0x1F3F4;
 78	private static final UnicodeRange FITZPATRICK = new UnicodeRange(0x1F3FB,0x1F3FF);
 79
 80	private static List<Symbol> parse(String input) {
 81		List<Symbol> symbols = new ArrayList<>();
 82		Builder builder = new Builder();
 83		boolean needsFinalBuild = false;
 84		for (int cp, i = 0; i < input.length(); i += Character.charCount(cp)) {
 85			cp = input.codePointAt(i);
 86			if (builder.offer(cp)) {
 87				needsFinalBuild = true;
 88			} else {
 89				symbols.add(builder.build());
 90				builder = new Builder();
 91				if (builder.offer(cp)) {
 92					needsFinalBuild = true;
 93				}
 94			}
 95		}
 96		if (needsFinalBuild) {
 97			symbols.add(builder.build());
 98		}
 99		return symbols;
100	}
101
102	public static Pattern generatePattern(CharSequence input) {
103		final StringBuilder pattern = new StringBuilder();
104		for(Symbol symbol : parse(input.toString())) {
105			if (symbol instanceof Emoji) {
106				if (pattern.length() != 0) {
107					pattern.append('|');
108				}
109				pattern.append(Pattern.quote(symbol.toString()));
110			}
111		}
112		return Pattern.compile(pattern.toString());
113	}
114
115	public static boolean isEmoji(String input) {
116		List<Symbol> symbols = parse(input);
117		return symbols.size() == 1 && symbols.get(0).isEmoji();
118	}
119
120	public static boolean isOnlyEmoji(String input) {
121		List<Symbol> symbols = parse(input);
122		for(Symbol symbol : symbols) {
123			if (!symbol.isEmoji()) {
124				return false;
125			}
126		}
127		return symbols.size() > 0;
128	}
129
130	private static abstract class Symbol {
131
132		private final String value;
133
134		public Symbol(List<Integer> codepoints) {
135			StringBuilder builder = new StringBuilder();
136			for(Integer codepoint : codepoints) {
137				builder.appendCodePoint(codepoint);
138			}
139			this.value = builder.toString();
140		}
141
142		abstract boolean isEmoji();
143
144		@Override
145		public String toString() {
146			return value;
147		}
148	}
149
150	public static class Emoji extends Symbol {
151
152		public Emoji(List<Integer> codepoints) {
153			super(codepoints);
154		}
155
156		@Override
157		boolean isEmoji() {
158			return true;
159		}
160	}
161
162	public static class Other extends Symbol {
163
164		public Other(List<Integer> codepoints) {
165			super(codepoints);
166		}
167
168		@Override
169		boolean isEmoji() {
170			return false;
171		}
172	}
173
174	private static class Builder {
175		private final List<Integer> codepoints = new ArrayList<>();
176
177
178		public boolean offer(int codepoint) {
179			boolean add = false;
180			if (this.codepoints.size() == 0) {
181				if (SYMBOLIZE.contains(codepoint)) {
182					add = true;
183				} else if (REGIONAL_INDICATORS.contains(codepoint)) {
184					add = true;
185				} else if (EMOJIS.contains(codepoint) && !FITZPATRICK.contains(codepoint) && codepoint != ZWJ) {
186					add = true;
187				}
188			} else {
189				int previous = codepoints.get(codepoints.size() -1);
190				if (codepoints.get(0) == BLACK_FLAG) {
191					add = TAGS.contains(codepoint);
192				} else if (COMBINING_ENCLOSING_KEYCAP == codepoint) {
193					add = KEYCAP_COMBINEABLE.contains(previous) || previous == VARIATION_16;
194				} else if (SYMBOLIZE.contains(previous)) {
195					add = codepoint == VARIATION_16;
196				} else if (REGIONAL_INDICATORS.contains(previous) && REGIONAL_INDICATORS.contains(codepoint)) {
197					add = codepoints.size() == 1;
198				} else if (previous == VARIATION_16) {
199					add = isMerger(codepoint);
200				} else if (FITZPATRICK.contains(previous)) {
201					add = codepoint == ZWJ;
202				} else if (ZWJ == previous) {
203					add = EMOJIS.contains(codepoint);
204				} else if (isMerger(codepoint)) {
205					add = true;
206				} else if (codepoint == VARIATION_16 && EMOJIS.contains(previous)) {
207					add = true;
208				}
209			}
210			if (add) {
211				codepoints.add(codepoint);
212				return true;
213			} else {
214				return false;
215			}
216		}
217
218		private static boolean isMerger(int codepoint) {
219			return codepoint == ZWJ || FITZPATRICK.contains(codepoint);
220		}
221
222		public Symbol build() {
223			if (codepoints.size() > 0 && SYMBOLIZE.contains(codepoints.get(codepoints.size() - 1))) {
224				return new Other(codepoints);
225			} else if (codepoints.size() > 1 && KEYCAP_COMBINEABLE.contains(codepoints.get(0)) && codepoints.get(codepoints.size() - 1) != COMBINING_ENCLOSING_KEYCAP) {
226				return new Other(codepoints);
227			}
228			return codepoints.size() == 0 ? new Other(codepoints): new Emoji(codepoints);
229		}
230	}
231
232	public static class UnicodeBlocks implements UnicodeSet {
233		final UnicodeSet[] unicodeSets;
234
235		public UnicodeBlocks(UnicodeSet... sets) {
236			this.unicodeSets = sets;
237		}
238
239		@Override
240		public boolean contains(int codepoint) {
241			for(UnicodeSet unicodeSet : unicodeSets) {
242				if (unicodeSet.contains(codepoint)) {
243					return true;
244				}
245			}
246			return false;
247		}
248	}
249
250	public interface UnicodeSet {
251		boolean contains(int codepoint);
252	}
253
254	public static class UnicodeList implements UnicodeSet {
255
256		private final List<Integer> list;
257
258		public UnicodeList(Integer... codes) {
259			this.list = Arrays.asList(codes);
260		}
261
262		@Override
263		public boolean contains(int codepoint) {
264			return this.list.contains(codepoint);
265		}
266	}
267
268
269	public static class UnicodeRange implements UnicodeSet {
270
271		private final int lower;
272		private final int upper;
273
274		UnicodeRange(int lower, int upper) {
275			this.lower = lower;
276			this.upper = upper;
277		}
278
279		public boolean contains(int codePoint) {
280			return codePoint >= lower && codePoint <= upper;
281		}
282	}
283}