Java
(      Unicode)

             2011 4    16
            twitter: @zaki50
Who am I

•          YAMAZAKI Makoto(twitter: @zaki50)
•   Android

    •
    •   StickyShortcut
•           Java
•           (CharacterSet)
    (Encoding)

•

• UTF-16

• UTF-8
•   Unicode 6.0
    11
•
• US ASCII
• Shift JIS
• JIS X 0208
• UCS-2
• UCS-4
•
                        Unicode

    • UTF-8
    • UTF-16
    • UTF-32
    • ...
               (   : US ASCII, Shift JIS)
Unicode
Unicode
•



• Xerox              Microsoft, Apple, Sun
    Microsystems, HP, JUST System
          The Unicode Consortium

•         iso10646
Unicode iso10646


•

•
    Unicode
Unicode
•       The Unicode Consortium
    ( http://www.unicode.org/ )

•



•
•       Unicode




    •

    •
(            , Ligature)
•



    •   (U+3075) +   (U+309A) =

    •
                (             (U+3077) )
Unicode

• Unicode




  Unicode
• NFC(Normalization Form C)
 • NFD(Normalization Form D)
 • NFKC(Normalization Form KC)
 • NFKD(Normalization Form KD)


C(Composition,   )/D(Decomposition,   )
          K(Compatibility,   )
•      NFC

• MacOS X          (HFS+)
             NFD

• NFKC, NFKD
Unicode
• C(Composition)




• D(Decomposition)
Unicode
• K(Compatibility)


         1




     :       (U+3000)        (U+0020)
             (     )    (5   )
Unicode

     K        K




C




D
Eclipse
UTF-16
UTF-16

•

    • Java   String

    • Windows
CodePoint                       UTF-16




U+0000-U+FFFF xxxx xxxx xxxx xxxx       xxxx xxxx xxxx xxxx



 U+010000-U      0000 0000 000u uuuu   1101 10ww wwxx xxxx
 +0010FFFF       xxxx xxxx xxxx xxxx    1101 11xx xxxx xxxx

                                           x,u,w ∈ {0,1}
                                         wwww = uuuuu - 1
UTF-16


•   2

•   2
• 1     16       16bit * 2

 • 1         16bit                16
                      20bit

 • U+D800-U+DFFF
        (11bit                )

 • 0xD800-0xDBFF, 0xDC00-0xDFFF
• UTF-16       2       1


       2

U+3000 = 0x3000       0x30, 0x00   UTF-16BE
U+3000 = 0x3000       0x00, 0x30   UTF-16LE
BOM(byte order mark)
•                  U+FEFF



    • U+FFFE

    •                    U+FEFF
               ZERO WIDTH NON-
        BREAKING SPACE
UTF-8
CodePoint                          bit                         bit

   U+00-U+7F                         0xxx xxxx                 7bits

U+0080-U+07FF                   110y yyyx 10xx xxxx            11bits

U+0800-U+FFFF           1110 yyyy 10yx xxxx (10xx xxxx) * 1    16bits

 U+010000-U+1FFFFF      1111 0yyy 10yy xxxx (10xx xxxx) * 2    21bits

U+00200000-U+03FFFFFF   1111 10yy 10yy yxxx (10xx xxxx) * 3    26bits

U+04000000-U+7FFFFFFF   1111 110y 10yy yyxx (10xx xxxx) * 4    31bits

                  x,y ∈ {0,1}                                  y    1
UTF-8
• US ASCII                US
    ASCII

•

    • 0x80-0xbf   2   ,



• 2
•



    •
• UTF-8    1




•         BOM
BOM(byte order mark)
•                   0xEF, 0xBB, 0xBF



• byte order mark



• UTF-8
1-6          2^31
UTF-8
         (1-4)        (2^21)


UTF-16   2-4     2^16-2*2^10+2^20



UTF-32    4         2^16+2^20
•             (CharacterSet)       (Encoding)



•             1



•             1
    (UTF-16                    )     2
• The Unicode Consortium(http://www.unicode.org/)
  • Unicode 6.0.0(http://www.unicode.org/versions/Unicode6.0.0/)
• Wikipedia
  • Unicode(http://ja.wikipedia.org/wiki/Unicode)
  • UTF-8(http://ja.wikipedia.org/wiki/UTF-8)
  • UTF-16(http://ja.wikipedia.org/wiki/UTF-16)
• Unicode                 (http://homepage1.nifty.com/nomenclator/
  unicode/normalization.htm)
(   )
1
                                   Unicode
package org.zakky.gudanama;

import java.text.Normalizer;

/**
 * Demonstrates the four Unicode normalization forms (NFC, NFD, NFKC, NFKD)
 * by normalizing a fixed three-character sample and printing the code points
 * of every result.
 */
public class NormalizeMain {

    /**
     * Prints the code points of the sample string, followed by one labelled
     * line per normalization form.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Sample: U+03D3, U+03D4, U+1E9B (each has both a canonical and a
        // compatibility decomposition, so all four forms differ).
        final String src = "\u03d3\u03d4\u1e9b";
        printCodePoints(src);

        // Expected: U+03D3, U+03D4, U+1E9B (already composed)
        System.out.print("NFC: ");
        printCodePoints(Normalizer.normalize(src, Normalizer.Form.NFC));

        // Expected: U+03D2 U+0301, U+03D2 U+0308, U+017F U+0307
        System.out.print("NFD: ");
        printCodePoints(Normalizer.normalize(src, Normalizer.Form.NFD));

        // Expected: U+038E, U+03AB, U+1E61
        System.out.print("NFKC: ");
        printCodePoints(Normalizer.normalize(src, Normalizer.Form.NFKC));

        // Expected: U+03A5 U+0301, U+03A5 U+0308, U+0073 U+0307
        System.out.print("NFKD: ");
        printCodePoints(Normalizer.normalize(src, Normalizer.Form.NFKD));
    }

    /**
     * Prints every code point of {@code str} on one line, formatted as
     * {@code U+XXXX} (upper-case hex, at least four digits) and separated by
     * single spaces. An empty string prints an empty line.
     *
     * @param str the text whose code points are printed
     */
    private static void printCodePoints(String str) {
        final StringBuilder sb = new StringBuilder();
        int index = 0;
        while (index < str.length()) {
            // codePointAt joins a valid surrogate pair and returns an unpaired
            // surrogate as-is, so a dangling high surrogate at the end of the
            // string cannot run past the last index.
            final int codePoint = str.codePointAt(index);
            if (sb.length() != 0) {
                sb.append(' ');
            }
            sb.append(String.format("U+%04X", codePoint));
            // Advance by 2 chars for supplementary code points, 1 otherwise.
            index += Character.charCount(codePoint);
        }
        System.out.println(sb);
    }
}
2

package org.zakky.gudanama;

/**
 * Demonstrates how a single supplementary-plane character is represented in a
 * Java {@code String} as a surrogate pair (two {@code char} values).
 */
public class SurrogatePairMain {

    /**
     * Prints the length, code point count, code point value and surrogate
     * classification of a one-character sample string.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The sample must be a character outside the BMP so that it occupies
        // two chars. Written with escapes so it survives any source encoding.
        // NOTE(review): U+20BB7 is a stand-in; confirm the character the
        // original sample intended.
        final String str = "\uD842\uDFB7";
        final char first = str.charAt(0);
        final char second = str.charAt(1);
        final int codePoint = Character.toCodePoint(first, second);

        System.out.println("str: " + str);
        System.out.println("length: " + str.length());
        System.out.println("code point count: " + str.codePointCount(0, str.length()));
        System.out.println("code point of str: "
                + codePoint + "(U+" + Integer.toHexString(codePoint) + ")");
        System.out.println("str[0] is high surrogate: " + Character.isHighSurrogate(first));
        System.out.println("str[0] is low surrogate: " + Character.isLowSurrogate(first));
        System.out.println("str[1] is high surrogate: " + Character.isHighSurrogate(second));
        System.out.println("str[1] is low surrogate: " + Character.isLowSurrogate(second));
        // Rebuilding the string from its two halves yields the same character.
        System.out.println(new String(new char[] {first, second}));
    }
}
3
                         BOM
package org.zakky.gudanama;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;

/**
 * Demonstrates skipping a leading byte order mark (U+FEFF) when reading text
 * through a {@link Reader}.
 */
public class BomMain {

    /**
     * Reads standard input as UTF-8, drops a leading BOM if one is present,
     * and writes the remaining text to standard output.
     *
     * @param args ignored
     * @throws IOException if reading standard input fails
     */
    public static void main(String[] args) throws IOException {
        final Reader in = new InputStreamReader(System.in, "UTF-8");
        try {
            final Reader r = skipBom(in);
            final char[] buf = new char[4096];
            int n;
            while ((n = r.read(buf)) != -1) {
                System.out.print(String.valueOf(buf, 0, n));
            }
            System.out.flush();
        } finally {
            in.close();
        }
    }

    /**
     * Returns a reader positioned just after a leading BOM, if any.
     *
     * @param r the reader to inspect; one character is consumed from it
     * @return {@code r} itself when the first character was a BOM or the
     *         stream was empty; otherwise a {@link PushbackReader} that
     *         yields the consumed character again before the rest of {@code r}
     * @throws IOException if reading from {@code r} fails
     */
    private static Reader skipBom(Reader r) throws IOException {
        final int first = r.read();
        // -1 means end of stream: there is nothing to push back, and pushing
        // it back would inject a bogus character into the stream.
        if (first == -1 || isBom(first)) {
            return r;
        }
        final PushbackReader pushbackReader = new PushbackReader(r);
        pushbackReader.unread(first);
        return pushbackReader;
    }

    /**
     * Tells whether {@code codepoint} is the byte order mark U+FEFF.
     *
     * @param codepoint the value returned by {@link Reader#read()}
     * @return {@code true} if it equals 0xFEFF
     */
    private static boolean isBom(int codepoint) {
        return codepoint == 0xfeff;
    }
}