So, we needed to match some UTF-8 encoded characters, and someone grabbed the proper UTF-8 documentation and started talking about code points.
And then you notice that Java 7 has no nice way to determine the code points of a string...
*Really*: seven versions in, and this is still not well supported.
At least, here is a snippet that prints each code point together with its type:
public static String printCharacters(final String s) {
for (final int codepoint : codePoints(s)) {
// we need Character.toChars and new String since we cannot call back to the specific character from the string from the iterator.
System.out.println(new String(Character.toChars(codepoint)) + "is of type: "+ typeToString(Character.getType(codepoint));
}
}
/**
 * Exposes the code points of a string as an {@link Iterable}, so they can be walked with a
 * for-each loop. Java 7 has no nice built-in support for this; Java 8 does.
 *
 * @param string the text to walk; a {@code null} value fails with a
 *               {@code NullPointerException} once iteration starts
 * @return an iterable whose iterators yield each code point in order; {@code remove()} is
 *         unsupported and {@code next()} past the end throws {@code NoSuchElementException}
 */
public static Iterable<Integer> codePoints(final String string) {
    return new Iterable<Integer>() {
        @Override
        public Iterator<Integer> iterator() {
            return new Iterator<Integer>() {
                // Index of the first char of the code point that next() will return.
                int nextIndex = 0;

                @Override
                public boolean hasNext() {
                    return nextIndex < string.length();
                }

                @Override
                public Integer next() {
                    // Honour the Iterator contract instead of leaking the
                    // StringIndexOutOfBoundsException that codePointAt would throw.
                    if (!hasNext()) {
                        throw new java.util.NoSuchElementException();
                    }
                    final int result = string.codePointAt(nextIndex);
                    // Advance by one or two chars depending on how the code point is stored.
                    nextIndex += Character.charCount(result);
                    return result;
                }

                @Override
                public void remove() {
                    throw new UnsupportedOperationException();
                }
            };
        }
    };
}
/**
 * Translates a general-category value, as returned by {@code Character.getType}, into a
 * readable label made of the constant name and its two-letter abbreviation.
 *
 * @param type the category value to describe
 * @return the label for the category, or {@code "dont know"} when the value matches none
 *         of the handled constants
 */
public static String typeToString(final int type) {
    final String label;
    switch (type) {
        // Letters
        case Character.UPPERCASE_LETTER: label = "UPPERCASE_LETTER [Lu]"; break;
        case Character.LOWERCASE_LETTER: label = "LOWERCASE_LETTER [Ll]"; break;
        case Character.TITLECASE_LETTER: label = "TITLECASE_LETTER [Lt]"; break;
        case Character.MODIFIER_LETTER: label = "MODIFIER_LETTER [Lm]"; break;
        case Character.OTHER_LETTER: label = "OTHER_LETTER [Lo]"; break;
        // Marks
        case Character.NON_SPACING_MARK: label = "NON_SPACING_MARK [Mn]"; break;
        case Character.ENCLOSING_MARK: label = "ENCLOSING_MARK [Me]"; break;
        case Character.COMBINING_SPACING_MARK: label = "COMBINING_SPACING_MARK [Mc]"; break;
        // Numbers
        case Character.DECIMAL_DIGIT_NUMBER: label = "DECIMAL_DIGIT_NUMBER [Nd]"; break;
        case Character.LETTER_NUMBER: label = "LETTER_NUMBER [Nl]"; break;
        case Character.OTHER_NUMBER: label = "OTHER_NUMBER [No]"; break;
        // Separators
        case Character.SPACE_SEPARATOR: label = "SPACE_SEPARATOR [Zs]"; break;
        case Character.LINE_SEPARATOR: label = "LINE_SEPARATOR [Zl]"; break;
        case Character.PARAGRAPH_SEPARATOR: label = "PARAGRAPH_SEPARATOR [Zp]"; break;
        // Other
        case Character.CONTROL: label = "CONTROL [Cc]"; break;
        case Character.FORMAT: label = "FORMAT [Cf]"; break;
        case Character.PRIVATE_USE: label = "PRIVATE_USE [Co]"; break;
        case Character.SURROGATE: label = "SURROGATE [Cs]"; break;
        case Character.UNASSIGNED: label = "UNASSIGNED [Cn]"; break;
        // Punctuation
        case Character.DASH_PUNCTUATION: label = "DASH_PUNCTUATION [Pd]"; break;
        case Character.START_PUNCTUATION: label = "START_PUNCTUATION [Ps]"; break;
        case Character.END_PUNCTUATION: label = "END_PUNCTUATION [Pe]"; break;
        case Character.CONNECTOR_PUNCTUATION: label = "CONNECTOR_PUNCTUATION [Pc]"; break;
        case Character.OTHER_PUNCTUATION: label = "OTHER_PUNCTUATION [Po]"; break;
        case Character.INITIAL_QUOTE_PUNCTUATION: label = "INITIAL_QUOTE_PUNCTUATION [Pi]"; break;
        case Character.FINAL_QUOTE_PUNCTUATION: label = "FINAL_QUOTE_PUNCTUATION [Pf]"; break;
        // Symbols
        case Character.MATH_SYMBOL: label = "MATH_SYMBOL [Sm]"; break;
        case Character.CURRENCY_SYMBOL: label = "CURRENCY_SYMBOL [Sc]"; break;
        case Character.MODIFIER_SYMBOL: label = "MODIFIER_SYMBOL [Sk]"; break;
        case Character.OTHER_SYMBOL: label = "OTHER_SYMBOL [So]"; break;
        // Anything that is not one of the constants above
        default: label = "dont know"; break;
    }
    return label;
}
And then you notice that Java 7 has no nice way to determine the code points of a string...
*Really*: seven versions in, and this is still not well supported.
At least, here is a snippet that prints each code point together with its type:
public static String printCharacters(final String s) {
for (final int codepoint : codePoints(s)) {
// we need Character.toChars and new String since we cannot call back to the specific character from the string from the iterator.
System.out.println(new String(Character.toChars(codepoint)) + "is of type: "+ typeToString(Character.getType(codepoint));
}
}
/**
 * Exposes the code points of a string as an {@link Iterable}, so they can be walked with a
 * for-each loop. Java 7 has no nice built-in support for this; Java 8 does.
 *
 * @param string the text to walk; a {@code null} value fails with a
 *               {@code NullPointerException} once iteration starts
 * @return an iterable whose iterators yield each code point in order; {@code remove()} is
 *         unsupported and {@code next()} past the end throws {@code NoSuchElementException}
 */
public static Iterable<Integer> codePoints(final String string) {
    return new Iterable<Integer>() {
        @Override
        public Iterator<Integer> iterator() {
            return new Iterator<Integer>() {
                // Index of the first char of the code point that next() will return.
                int nextIndex = 0;

                @Override
                public boolean hasNext() {
                    return nextIndex < string.length();
                }

                @Override
                public Integer next() {
                    // Honour the Iterator contract instead of leaking the
                    // StringIndexOutOfBoundsException that codePointAt would throw.
                    if (!hasNext()) {
                        throw new java.util.NoSuchElementException();
                    }
                    final int result = string.codePointAt(nextIndex);
                    // Advance by one or two chars depending on how the code point is stored.
                    nextIndex += Character.charCount(result);
                    return result;
                }

                @Override
                public void remove() {
                    throw new UnsupportedOperationException();
                }
            };
        }
    };
}
/**
 * Translates a general-category value, as returned by {@code Character.getType}, into a
 * readable label made of the constant name and its two-letter abbreviation.
 *
 * @param type the category value to describe
 * @return the label for the category, or {@code "dont know"} when the value matches none
 *         of the handled constants
 */
public static String typeToString(final int type) {
    final String label;
    switch (type) {
        // Letters
        case Character.UPPERCASE_LETTER: label = "UPPERCASE_LETTER [Lu]"; break;
        case Character.LOWERCASE_LETTER: label = "LOWERCASE_LETTER [Ll]"; break;
        case Character.TITLECASE_LETTER: label = "TITLECASE_LETTER [Lt]"; break;
        case Character.MODIFIER_LETTER: label = "MODIFIER_LETTER [Lm]"; break;
        case Character.OTHER_LETTER: label = "OTHER_LETTER [Lo]"; break;
        // Marks
        case Character.NON_SPACING_MARK: label = "NON_SPACING_MARK [Mn]"; break;
        case Character.ENCLOSING_MARK: label = "ENCLOSING_MARK [Me]"; break;
        case Character.COMBINING_SPACING_MARK: label = "COMBINING_SPACING_MARK [Mc]"; break;
        // Numbers
        case Character.DECIMAL_DIGIT_NUMBER: label = "DECIMAL_DIGIT_NUMBER [Nd]"; break;
        case Character.LETTER_NUMBER: label = "LETTER_NUMBER [Nl]"; break;
        case Character.OTHER_NUMBER: label = "OTHER_NUMBER [No]"; break;
        // Separators
        case Character.SPACE_SEPARATOR: label = "SPACE_SEPARATOR [Zs]"; break;
        case Character.LINE_SEPARATOR: label = "LINE_SEPARATOR [Zl]"; break;
        case Character.PARAGRAPH_SEPARATOR: label = "PARAGRAPH_SEPARATOR [Zp]"; break;
        // Other
        case Character.CONTROL: label = "CONTROL [Cc]"; break;
        case Character.FORMAT: label = "FORMAT [Cf]"; break;
        case Character.PRIVATE_USE: label = "PRIVATE_USE [Co]"; break;
        case Character.SURROGATE: label = "SURROGATE [Cs]"; break;
        case Character.UNASSIGNED: label = "UNASSIGNED [Cn]"; break;
        // Punctuation
        case Character.DASH_PUNCTUATION: label = "DASH_PUNCTUATION [Pd]"; break;
        case Character.START_PUNCTUATION: label = "START_PUNCTUATION [Ps]"; break;
        case Character.END_PUNCTUATION: label = "END_PUNCTUATION [Pe]"; break;
        case Character.CONNECTOR_PUNCTUATION: label = "CONNECTOR_PUNCTUATION [Pc]"; break;
        case Character.OTHER_PUNCTUATION: label = "OTHER_PUNCTUATION [Po]"; break;
        case Character.INITIAL_QUOTE_PUNCTUATION: label = "INITIAL_QUOTE_PUNCTUATION [Pi]"; break;
        case Character.FINAL_QUOTE_PUNCTUATION: label = "FINAL_QUOTE_PUNCTUATION [Pf]"; break;
        // Symbols
        case Character.MATH_SYMBOL: label = "MATH_SYMBOL [Sm]"; break;
        case Character.CURRENCY_SYMBOL: label = "CURRENCY_SYMBOL [Sc]"; break;
        case Character.MODIFIER_SYMBOL: label = "MODIFIER_SYMBOL [Sk]"; break;
        case Character.OTHER_SYMBOL: label = "OTHER_SYMBOL [So]"; break;
        // Anything that is not one of the constants above
        default: label = "dont know"; break;
    }
    return label;
}
Reacties
Een reactie posten