View Javadoc
1   package emissary.util;
2   
3   import emissary.test.core.junit5.UnitTest;
4   
5   import org.junit.jupiter.api.Test;
6   
7   import java.util.List;
8   
9   import static org.junit.jupiter.api.Assertions.assertFalse;
10  import static org.junit.jupiter.api.Assertions.assertTrue;
11  
12  class CharsetUtilTest extends UnitTest {
13      // Strings from wikipedia, 2017-07-18
14      // If this test fails, (why else would you be looking here?)
15      // one thing to look at is to ensure "javac -encoding=utf8"
16      // is being turned on so that these strings make it into the
17      // .class file as utf8.
18      public static final List<String> S = List.of("L'ordinateur à 100$ bientôt sur le marché ?",
19              "Um dicionário é uma compilação de palavras ou dos termos próprios",
20              "Un dictionnaire est un ouvrage de référence contenant l’ensemble des mots d’une langue",
21              "Sözlük bir dilin veya dillerin kelime haznesini", "القاموس هو أداة لجمع كلمات لغة ما وتعريفها وشرحها",
22              "Слова́рь это книга, информация в которой упорядочена c помощью разбивки на небольшие статьи, отсортированные по названию или тематике",
23              "稱辭典,是為詞語提供音韻、釋義、例句用法等等的工具書。", "Từ điển là danh sách các từ, ngữ được sắp xếp thành các từ vị chuẩn",
24              "Un dicționar sau lexicon este o lucrare lexicografică care cuprinde o parte semnificativă dintre cuvintele unei limbi",
25              "د مڼې، پیاز او الو خوند یو ډول دی، مګر د مختلف ډوله وږم لرلو سره د دوي خوند مختلف ښکاري");
26  
27      @Test
28      void testUTF8() {
29          assertTrue(CharsetUtil.isUtf8("This is a test."), "Ascii is utf8");
30          assertTrue(CharsetUtil.isUtf8("!@#$%^&*(F)=+-_[]{}\\|'\";:,.></?`~"), "Punctuation is utf8");
31          assertTrue(CharsetUtil.isUtf8("0123456789 9876543210"), "Numbers are utf8");
32  
33          assertTrue(CharsetUtil.isUtf8("This is a bytes array test. 123 #$%".getBytes()), "Ascii bytes are utf8");
34  
35          for (int i = 0; i < S.size(); i++) {
36              assertTrue(CharsetUtil.isUtf8(S.get(i)),
37                      "Foreign strings from java, entry " + i + " of " + S.size() + " == " + S.get(i) + "/" + S.get(i).length());
38              assertTrue(CharsetUtil.isUtf8(S.get(i).getBytes()),
39                      "Foreign bytes from java, entry " + i + " of " + S.size() + " == " + S.get(i) + "/" + S.get(i).getBytes().length);
40          }
41      }
42  
43      @Test
44      void testMultibyte() {
45          for (int i = 0; i < S.size(); i++) {
46              assertTrue(CharsetUtil.hasMultibyte(S.get(i)),
47                      "Foreign strings have multibyte, entry " + i + " of " + S.size() + " == " + S.get(i) + "/" + S.get(i).length());
48          }
49  
50          assertFalse(CharsetUtil.hasMultibyte(null), "Null does not have multibyte");
51  
52          assertFalse(CharsetUtil.hasMultibyte(""), "Empty does not have multibyte");
53  
54          assertFalse(CharsetUtil.hasMultibyte("1234 abcde !@#$%"), "Ascii does not have multibyte");
55      }
56  
57      @Test
58      void testNotUTF8() {
59          byte[] b = new byte[] {(byte) 192, (byte) 192, (byte) 224, (byte) 224, (byte) 192, (byte) 0, (byte) 192, (byte) 224};
60          assertFalse(CharsetUtil.isUtf8(b), "Bad utf-8 stream is not utf-8");
61      }
62  
63      @Test
64      void testIsAscii() {
65          assertTrue(CharsetUtil.isAscii("abcdefg 1234567"), "Ascii is easy");
66          assertFalse(CharsetUtil.isAscii("Шарифа"), "This is not ascii");
67      }
68  
69  }