View Javadoc
1   package emissary.test.util;
2   
3   import java.util.HashMap;
4   import java.util.Map;
5   
6   /**
7    * A class that provides some tricky samples. These samples can be used in testing to make sure our code and the 3rd
8    * party libraries we choose can handle unusual cases.
9    * <p>
10   * Each example contains detailed explanation and links to useful reference materials.
11   */
12  public final class ComplexUnicodeSamples {
13  
14      private ComplexUnicodeSamples() {}
15  
16      /**
17       * Returns a string that contains one graphical unit (in this case an emoji) that consists of 5 Unicode scalar values.
18       * The user-perceived string would be one facepalming emoji. A user would expect hit the arrow key once to traverse the
19       * cursor across this one emoji on the screen. The length of the UTF-8 encoded byte array is 17 bytes. One emoji, 17
20       * UTF8 bytes.
21       * <p>
22       * SCALAR 1: First, there’s a base character that means a person face palming.
23       * <p>
24       * SCALAR 2: By default, the person would have a cartoonish yellow color. The next character is an emoji skintone
25       * modifier the changes the color of the person’s skin (and, in practice, also the color of the person’s hair).
26       * <p>
27       * SCALAR 3 and 4: By default, the gender of the person is undefined, and e.g. Apple defaults to what they consider a
28       * male appearance and e.g. Google defaults to what they consider a female appearance. The next two scalar values pick a
29       * male-typical appearance specifically regardless of font and vendor. Instead of being an emoji-specific modifier like
30       * the skin tone, the gender specification uses an emoji-predating gender symbol (MALE SIGN) explicitly ligated using
31       * the ZERO WIDTH JOINER with the (skin-toned) face-palming person. (Whether it is a good or a bad idea that the skin
32       * tone and gender specifications use different mechanisms is out of the scope of this post.)
33       * <p>
34       * SCALAR 5: Finally, VARIATION SELECTOR-16 makes it explicit that we want a multicolor emoji rendering instead of a
35       * monochrome dingbat rendering.
36       * 
37       * @return the Java string containing this one facepalming dude emoji with a not-yellow skin tone.
38       * 
39       * @see ComplexUnicodeSamplesTest#demonstrateMetadataAboutFacePalmDude()
40       * @see <a href="https://hsivonen.fi/string-length/">https://hsivonen.fi/string-length/</a>
41       */
42      public static String getFacePalmingMaleControlSkintone() {
43  
44          StringBuilder sb = new StringBuilder();
45  
46          // SCALAR 1: U+1F926 FACE PALM
47          // Use the lookup for how to represent in java
48          // https://www.fileformat.info/info/unicode/char/1f926/index.htm
49          // UTF-32 code units: 1
50          // UTF-16 code units: 2
51          // UTF-8 code units: 4
52          // UTF-32 bytes: 4
53          // UTF-16 bytes: 4
54          // UTF-8 bytes: 4
55          sb.append("\uD83E\uDD26");
56  
57          // SCALAR 2: U+1F3FC EMOJI MODIFIER FITZPATRICK TYPE-3
58          // https://www.fileformat.info/info/unicode/char/1f3fc/index.htm
59          // UTF-32 code units: 1
60          // UTF-16 code units: 2
61          // UTF-8 code units: 4
62          // UTF-32 bytes: 4
63          // UTF-16 bytes: 4
64          // UTF-8 bytes: 4
65          sb.append("\uD83C\uDFFC");
66  
67          // SCALAR 3: U+200D ZERO WIDTH JOINER
68          // UTF-32 code units: 1
69          // UTF-16 code units: 1
70          // UTF-8 code units: 3
71          // UTF-32 bytes: 4
72          // UTF-16 bytes: 2
73          // UTF-8 bytes: 3
74          sb.append("\u200D");
75  
76          // SCALAR 4: U+2642 MALE SIGN
77          // UTF-32 code units: 1
78          // UTF-16 code units: 1
79          // UTF-8 code units: 3
80          // UTF-32 bytes: 4
81          // UTF-16 bytes: 2
82          // UTF-8 bytes: 3
83          sb.append("\u2642");
84  
85          // SCALAR 5: U+FE0F VARIATION SELECTOR-16
86          // UTF-32 code units: 1
87          // UTF-16 code units: 1
88          // UTF-8 code units: 3
89          // UTF-32 bytes: 4
90          // UTF-16 bytes: 2
91          // UTF-8 bytes: 3
92          sb.append("\uFE0F");
93  
94          return sb.toString();
95      }
96  
97  
98      /**
99       * This map is useful for testing that our code and any 3rd party XML library we are using is handling unicode within
100      * XML correctly.
101      * 
102      * @return A map of strings where the key is the XML node containing an XML-escaped surrogate pair unicode value and the
103      *         value is is the properly extracted java string value with un-escaped unicode strings.
104      * @see <a href=
105      *      "https://github.com/FasterXML/woodstox/pull/174/files">https://github.com/FasterXML/woodstox/pull/174/files</a>
106      */
107     public static Map<String, String> getXmlSamples() {
108         // See https://github.com/FasterXML/woodstox/pull/174/files
109         Map<String, String> xmlWithExp = new HashMap<String, String>();
110         // Numeric surrogate pairs
111         xmlWithExp.put("<root>surrogate pair: &#55356;&#57221;.</root>",
112                 "surrogate pair: \uD83C\uDF85.");
113         // Hex and numeric surrogate pairs
114         xmlWithExp.put("<root>surrogate pair: &#xD83C;&#57221;.</root>",
115                 "surrogate pair: \uD83C\uDF85.");
116         // Numeric and hex surrogate pairs
117         xmlWithExp.put("<root>surrogate pair: &#55356;&#xDF85;.</root>",
118                 "surrogate pair: \uD83C\uDF85.");
119         // Hex surrogate pairs
120         xmlWithExp.put("<root>surrogate pair: &#xD83C;&#xDF85;.</root>",
121                 "surrogate pair: \uD83C\uDF85.");
122         // Two surrogate pairs
123         xmlWithExp.put("<root>surrogate pair: &#55356;&#57221;&#55356;&#57220;.</root>",
124                 "surrogate pair: \uD83C\uDF85\uD83C\uDF84.");
125         // Surrogate pair and simple entity
126         xmlWithExp.put("<root>surrogate pair: &#55356;&#57221;&#8482;.</root>",
127                 "surrogate pair: \uD83C\uDF85\u2122.");
128 
129         return xmlWithExp;
130     }
131 
132     /**
133      * This will not work properly in versions of java earlier than Java 20.
134      * <p>
135      * Once we get to Java 20, this method should work properly.
136      * <p>
137      * Character boundary analysis allows users to interact with characters as they expect to, for example, when moving the
138      * cursor through a text string. Character boundary analysis provides correct navigation through character strings,
139      * regardless of how the character is stored. The boundaries returned may be those of supplementary characters,
140      * combining character sequences, or ligature clusters. For example, an accented character might be stored as a base
141      * character and a diacritical mark. What users consider to be a character can differ between languages.
142      * 
143      * @see <a href=
144      *      "https://horstmann.com/unblog/2023-10-03/index.html">https://horstmann.com/unblog/2023-10-03/index.html</a> -
145      *      Scroll to the section titled "Just Use Strings"
146      *
147      * @param text - the string to analyze.
148      * @return the count of user-perceived graphemes as based on the character break iterator. In versions of java earlier
149      *         than Java 20, this will not function as expected.
150      */
151     public static int countGraphemesUsingJavaBuiltInBreakIterator(String text) {
152 
153         java.text.BreakIterator breakIterator = java.text.BreakIterator.getCharacterInstance();
154         breakIterator.setText(text);
155 
156         int count = 0;
157         for (int end = breakIterator.next(); end != java.text.BreakIterator.DONE; end = breakIterator.next()) {
158             count++;
159         }
160 
161         return count;
162     }
163 
164     /**
165      * Using the industry-standard ICU4J library provided by IBM.
166      * <p>
167      * NOTE: Updating the version of this library might change which unicode database is referenced for these calculations.
168      * We should strive to keep this library as up-to-date as possible in both test and production source code.
169      * 
170      * @param text the string to analyze
171      * @return a count of how many user-perceived glyphs/graphemes are present in the string. If you placed a cursor diretly
172      *         to the left (or right for right-to-left string), and pressed the arrow key to traverse the string, how many
173      *         times would you need to press the arrow key to traverse to the right-most end of the string (or leftmost for
174      *         R-to-L strings).
175      */
176     public static int countGraphemesUsingIcu4J(String text) {
177         com.ibm.icu.text.BreakIterator breakIterator = com.ibm.icu.text.BreakIterator.getCharacterInstance();
178         breakIterator.setText(text);
179 
180         int count = 0;
181         for (int end = breakIterator.next(); end != com.ibm.icu.text.BreakIterator.DONE; end = breakIterator.next()) {
182             count++;
183         }
184 
185         return count;
186     }
187 
188 }