View Javadoc
1   package emissary.test.util;
2   
3   import com.ibm.icu.text.Normalizer2;
4   import org.junit.jupiter.api.Test;
5   import org.junit.jupiter.api.condition.DisabledForJreRange;
6   import org.junit.jupiter.api.condition.EnabledForJreRange;
7   import org.junit.jupiter.api.condition.JRE;
8   
9   import java.nio.charset.Charset;
10  import java.nio.charset.StandardCharsets;
11  
12  import static org.junit.jupiter.api.Assertions.assertEquals;
13  import static org.junit.jupiter.api.Assertions.assertNotEquals;
14  import static org.junit.jupiter.api.Assertions.assertTrue;
15  
16  class ComplexUnicodeSamplesTest {
17  
18      /**
19       * Interesting observations about face palm dude emoji.
20       * <p>
21       * We’ve seen four different lengths so far:
22       * 
23       * <ul>
24       * <li>Number of UTF-8 code units (17 in this case)</li>
25       * <li>Number of UTF-16 code units (7 in this case)</li>
26       * <li>Number of UTF-32 code units or Unicode scalar values (5 in this case)</li>
27       * <li>Number of extended grapheme clusters (1 in this case)</li>
28       * </ul>
29       * Given a valid Unicode string and a version of Unicode, all of the above are well-defined and it holds that each item
30       * higher on the list is greater or equal than the items lower on the list.
31       * <p>
32       * One of these is not like the others, though: The first three numbers have an unchanging definition for any valid
33       * Unicode string whether it contains currently assigned scalar values or whether it is from the future and contains
34       * unassigned scalar values as far as software written today is aware. Also, computing the first three lengths does not
35       * involve lookups from the Unicode database. However, the last item depends on the Unicode version and involves lookups
36       * from the Unicode database. If a string contains scalar values that are unassigned as far as the copy of the Unicode
37       * database that the program is using is aware, the program will potentially overcount extended grapheme clusters in the
38       * string compared to a program whose copy of the Unicode database is newer and has assignments for those scalar values
39       * (and some of those assignments turn out to be combining characters).
40       */
41      @Test
42      void demonstrateMetadataAboutFacePalmDude() {
43  
44          String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
45  
46          // SCALAR 1 is 4 UTF8 bytes
47          // SCALAR 2 is 4 UTF8 bytes
48          // SCALAR 3 is 3 UTF8 bytes
49          // SCALAR 4 is 3 UTF8 bytes
50          // SCALAR 5 is 3 UTF8 bytes
51          // TOTAL : 17 UTF8 bytes
52          assertEquals(17, facepalm.getBytes(StandardCharsets.UTF_8).length);
53          assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_8)));
54  
55          // SCALAR 1 is 4 UTF16 bytes
56          // SCALAR 2 is 4 UTF16 bytes
57          // SCALAR 3 is 2 UTF16 bytes
58          // SCALAR 4 is 2 UTF16 bytes
59          // SCALAR 5 is 2 UTF16 bytes
60          // TOTAL : 14 UTF16 bytes if no BOM is needed
61          // Java typically defaults to UTF-16BE
62          assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16BE).length);
63          assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16BE), StandardCharsets.UTF_16BE));
64          assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16LE).length);
65          assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16LE), StandardCharsets.UTF_16LE));
66  
67          // When the endianness isn't specified, 2 bytes are used for the byte order marker
68          // The BOM is a special character (U+FEFF) used to indicate the endianness (byte order)
69          // of a UTF-16 encoded file or stream. In UTF-16, the BOM can be either:
70          // FE FF (Big Endian)
71          // FF FE (Little Endian)
72          assertEquals(16, facepalm.getBytes(StandardCharsets.UTF_16).length);
73          assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16), StandardCharsets.UTF_16));
74  
75          // 5 UTF-32 characters at 4 bytes per character
76          assertEquals(20, facepalm.getBytes(Charset.forName("UTF-32")).length);
77          assertEquals(facepalm, new String(facepalm.getBytes(Charset.forName("UTF-32")), Charset.forName("UTF-32")));
78  
79          // single byte encoding is not going to produce what you want
80          assertEquals(5, facepalm.getBytes(StandardCharsets.ISO_8859_1).length);
81          assertNotEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.ISO_8859_1));
82  
83  
84          assertEquals(5, facepalm.codePointCount(0, facepalm.length()));
85  
86          // ICU4J BreakIterator gets it right
87          assertEquals(1, ComplexUnicodeSamples.countGraphemesUsingIcu4J(facepalm));
88  
89          // See
90          // demonstrateMetadataAboutFacePalmDudeForJava20()
91          // and
92          // demonstrateMetadataAboutFacePalmDudePriorToJava20()
93          // to see how using the intrinsic java BreakIterator doesn't
94          // get it right until Java 20.
95  
96  
97          // It's already normalized in it's natural form.
98          Normalizer2 nfcDecomp = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
99          Normalizer2 nfkcDecomp = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
100         assertTrue(nfcDecomp.isNormalized(facepalm));
101         assertTrue(nfkcDecomp.isNormalized(facepalm));
102 
103         Normalizer2 nfcComp = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
104         Normalizer2 nfkcComp = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
105         assertTrue(nfcComp.isNormalized(facepalm));
106         assertTrue(nfkcComp.isNormalized(facepalm));
107 
108     }
109 
110     @Test
111     @EnabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java 20 and later.")
112     void demonstrateMetadataAboutFacePalmDudeForJava20() {
113         String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
114         assertEquals(1, ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(facepalm));
115     }
116 
117     @Test
118     @DisabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java versions up to not including Java 20.")
119     void demonstrateMetadataAboutFacePalmDudePriorToJava20() {
120         String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
121         assertEquals(4, ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(facepalm));
122         // it should be 1, but it's wrong until Java 20.
123     }
124 
125     @Test
126     @EnabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java 17 and later.")
127     void demonstrateMetadataAboutFacePalmDudeForJava17AndLater() {
128         String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
129         int j = 27;
130         assertEquals(j, facepalm.repeat(j).split("\\b{g}").length);
131     }
132 
133     @Test
134     @DisabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java versions up to not including Java 17.")
135     void demonstrateMetadataAboutFacePalmDudePriorToJava17() {
136         String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();
137         int j = 27;
138         assertEquals(j * 3, facepalm.repeat(j).split("\\b{g}").length);
139         // it should be 27, but it's wrong until Java 17
140     }
141 
142 
143 }