1 package emissary.test.util; 2 3 import java.util.HashMap; 4 import java.util.Map; 5 6 /** 7 * A class that provides some tricky samples. These samples can be used in testing to make sure our code and the 3rd 8 * party libraries we choose can handle unusual cases. 9 * <p> 10 * Each example contains detailed explanation and links to useful reference materials. 11 */ 12 public final class ComplexUnicodeSamples { 13 14 private ComplexUnicodeSamples() {} 15 16 /** 17 * Returns a string that contains one graphical unit (in this case an emoji) that consists of 5 Unicode scalar values. 18 * The user-perceived string would be one facepalming emoji. A user would expect hit the arrow key once to traverse the 19 * cursor across this one emoji on the screen. The length of the UTF-8 encoded byte array is 17 bytes. One emoji, 17 20 * UTF8 bytes. 21 * <p> 22 * SCALAR 1: First, there’s a base character that means a person face palming. 23 * <p> 24 * SCALAR 2: By default, the person would have a cartoonish yellow color. The next character is an emoji skintone 25 * modifier the changes the color of the person’s skin (and, in practice, also the color of the person’s hair). 26 * <p> 27 * SCALAR 3 and 4: By default, the gender of the person is undefined, and e.g. Apple defaults to what they consider a 28 * male appearance and e.g. Google defaults to what they consider a female appearance. The next two scalar values pick a 29 * male-typical appearance specifically regardless of font and vendor. Instead of being an emoji-specific modifier like 30 * the skin tone, the gender specification uses an emoji-predating gender symbol (MALE SIGN) explicitly ligated using 31 * the ZERO WIDTH JOINER with the (skin-toned) face-palming person. (Whether it is a good or a bad idea that the skin 32 * tone and gender specifications use different mechanisms is out of the scope of this post.) 33 * <p> 34 * SCALAR 5: Finally, VARIATION SELECTOR-16 makes it explicit that we want a multicolor emoji rendering instead of a 35 * monochrome dingbat rendering. 36 * 37 * @return the Java string containing this one facepalming dude emoji with a not-yellow skin tone. 38 * 39 * @see ComplexUnicodeSamplesTest#demonstrateMetadataAboutFacePalmDude() 40 * @see <a href="https://hsivonen.fi/string-length/">https://hsivonen.fi/string-length/</a> 41 */ 42 public static String getFacePalmingMaleControlSkintone() { 43 44 StringBuilder sb = new StringBuilder(); 45 46 // SCALAR 1: U+1F926 FACE PALM 47 // Use the lookup for how to represent in java 48 // https://www.fileformat.info/info/unicode/char/1f926/index.htm 49 // UTF-32 code units: 1 50 // UTF-16 code units: 2 51 // UTF-8 code units: 4 52 // UTF-32 bytes: 4 53 // UTF-16 bytes: 4 54 // UTF-8 bytes: 4 55 sb.append("\uD83E\uDD26"); 56 57 // SCALAR 2: U+1F3FC EMOJI MODIFIER FITZPATRICK TYPE-3 58 // https://www.fileformat.info/info/unicode/char/1f3fc/index.htm 59 // UTF-32 code units: 1 60 // UTF-16 code units: 2 61 // UTF-8 code units: 4 62 // UTF-32 bytes: 4 63 // UTF-16 bytes: 4 64 // UTF-8 bytes: 4 65 sb.append("\uD83C\uDFFC"); 66 67 // SCALAR 3: U+200D ZERO WIDTH JOINER 68 // UTF-32 code units: 1 69 // UTF-16 code units: 1 70 // UTF-8 code units: 3 71 // UTF-32 bytes: 4 72 // UTF-16 bytes: 2 73 // UTF-8 bytes: 3 74 sb.append("\u200D"); 75 76 // SCALAR 4: U+2642 MALE SIGN 77 // UTF-32 code units: 1 78 // UTF-16 code units: 1 79 // UTF-8 code units: 3 80 // UTF-32 bytes: 4 81 // UTF-16 bytes: 2 82 // UTF-8 bytes: 3 83 sb.append("\u2642"); 84 85 // SCALAR 5: U+FE0F VARIATION SELECTOR-16 86 // UTF-32 code units: 1 87 // UTF-16 code units: 1 88 // UTF-8 code units: 3 89 // UTF-32 bytes: 4 90 // UTF-16 bytes: 2 91 // UTF-8 bytes: 3 92 sb.append("\uFE0F"); 93 94 return sb.toString(); 95 } 96 97 98 /** 99 * This map is useful for testing that our code and any 3rd party XML library we are using is handling unicode within 100 * XML correctly. 101 * 102 * @return A map of strings where the key is the XML node containing an XML-escaped surrogate pair unicode value and the 103 * value is is the properly extracted java string value with un-escaped unicode strings. 104 * @see <a href= 105 * "https://github.com/FasterXML/woodstox/pull/174/files">https://github.com/FasterXML/woodstox/pull/174/files</a> 106 */ 107 public static Map<String, String> getXmlSamples() { 108 // See https://github.com/FasterXML/woodstox/pull/174/files 109 Map<String, String> xmlWithExp = new HashMap<String, String>(); 110 // Numeric surrogate pairs 111 xmlWithExp.put("<root>surrogate pair: ��.</root>", 112 "surrogate pair: \uD83C\uDF85."); 113 // Hex and numeric surrogate pairs 114 xmlWithExp.put("<root>surrogate pair: ��.</root>", 115 "surrogate pair: \uD83C\uDF85."); 116 // Numeric and hex surrogate pairs 117 xmlWithExp.put("<root>surrogate pair: ��.</root>", 118 "surrogate pair: \uD83C\uDF85."); 119 // Hex surrogate pairs 120 xmlWithExp.put("<root>surrogate pair: ��.</root>", 121 "surrogate pair: \uD83C\uDF85."); 122 // Two surrogate pairs 123 xmlWithExp.put("<root>surrogate pair: ����.</root>", 124 "surrogate pair: \uD83C\uDF85\uD83C\uDF84."); 125 // Surrogate pair and simple entity 126 xmlWithExp.put("<root>surrogate pair: ��™.</root>", 127 "surrogate pair: \uD83C\uDF85\u2122."); 128 129 return xmlWithExp; 130 } 131 132 /** 133 * This will not work properly in versions of java earlier than Java 20. 134 * <p> 135 * Once we get to Java 20, this method should work properly. 136 * <p> 137 * Character boundary analysis allows users to interact with characters as they expect to, for example, when moving the 138 * cursor through a text string. Character boundary analysis provides correct navigation through character strings, 139 * regardless of how the character is stored. The boundaries returned may be those of supplementary characters, 140 * combining character sequences, or ligature clusters. For example, an accented character might be stored as a base 141 * character and a diacritical mark. What users consider to be a character can differ between languages. 142 * 143 * @see <a href= 144 * "https://horstmann.com/unblog/2023-10-03/index.html">https://horstmann.com/unblog/2023-10-03/index.html</a> - 145 * Scroll to the section titled "Just Use Strings" 146 * 147 * @param text - the string to analyze. 148 * @return the count of user-perceived graphemes as based on the character break iterator. In versions of java earlier 149 * than Java 20, this will not function as expected. 150 */ 151 public static int countGraphemesUsingJavaBuiltInBreakIterator(String text) { 152 153 java.text.BreakIterator breakIterator = java.text.BreakIterator.getCharacterInstance(); 154 breakIterator.setText(text); 155 156 int count = 0; 157 for (int end = breakIterator.next(); end != java.text.BreakIterator.DONE; end = breakIterator.next()) { 158 count++; 159 } 160 161 return count; 162 } 163 164 /** 165 * Using the industry-standard ICU4J library provided by IBM. 166 * <p> 167 * NOTE: Updating the version of this library might change which unicode database is referenced for these calculations. 168 * We should strive to keep this library as up-to-date as possible in both test and production source code. 169 * 170 * @param text the string to analyze 171 * @return a count of how many user-perceived glyphs/graphemes are present in the string. If you placed a cursor diretly 172 * to the left (or right for right-to-left string), and pressed the arrow key to traverse the string, how many 173 * times would you need to press the arrow key to traverse to the right-most end of the string (or leftmost for 174 * R-to-L strings). 175 */ 176 public static int countGraphemesUsingIcu4J(String text) { 177 com.ibm.icu.text.BreakIterator breakIterator = com.ibm.icu.text.BreakIterator.getCharacterInstance(); 178 breakIterator.setText(text); 179 180 int count = 0; 181 for (int end = breakIterator.next(); end != com.ibm.icu.text.BreakIterator.DONE; end = breakIterator.next()) { 182 count++; 183 } 184 185 return count; 186 } 187 188 }