A thorough way to detect Chinese characters in Java
- 2020-04-01 01:30:37
- OfStack
Java code usually relies on the Unicode code-point range (0x4e00, 0x9fbb) to determine whether a string contains Chinese characters, but this range is not very accurate, because Chinese punctuation marks such as "，" and "。" fall outside it and are not recognized.
The following is a more complete way to perform the check: CharUtil.java
import java.util.regex.Pattern;
/**
 * Helpers for deciding whether text contains Chinese (CJK) characters.
 *
 * <p>Three strategies are offered so their coverage can be compared:
 * <ul>
 *   <li>{@link #isChinese(String)} checks the Unicode block of every code point and
 *       also accepts CJK / full-width / general punctuation;</li>
 *   <li>{@link #isChineseByREG(String)} uses a fixed code-point range regex;</li>
 *   <li>{@link #isChineseByName(String)} uses the named Unicode block in a regex.</li>
 * </ul>
 * The two regex variants recognize ideographs only, not punctuation.
 */
public class CharUtil {

    /** Range-based pattern (U+4E00..U+9FBF); compiled once instead of on every call. */
    private static final Pattern CJK_RANGE_PATTERN = Pattern.compile("[\u4E00-\u9FBF]+");

    /**
     * Block-based pattern: a character inside the "CJK Unified Ideographs" block that is
     * also an assigned code point.
     *
     * <p>In a regex, lower-case \p{X} means "has property X" and upper-case \P{X} means
     * "does not have property X". Cn is the "unassigned" category, so [^\p{Cn}] keeps only
     * assigned characters. The {@code &&} intersection operator is only meaningful inside
     * a character class, hence the enclosing brackets.
     */
    private static final Pattern CJK_BLOCK_PATTERN =
            Pattern.compile("[\\p{InCJKUnifiedIdeographs}&&[^\\p{Cn}]]");

    /**
     * Small demo: runs every detection strategy over a few sample strings and prints
     * the per-character verdict.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // NOTE(review): the sample literals in the published listing were garbled (unescaped
        // quotes, CJK text replaced by ASCII). The non-ASCII samples below are representative
        // replacements written as Unicode escapes so the file stays ASCII-only.
        String[] strArr = new String[] {
            "www.micmiu.com",
            "!@#$%^&*()_+{}[]|\"'?/:;<>,.",
            // full-width punctuation: exclamation mark, yen sign, ellipsis, parentheses,
            // em dashes and curly quotes
            "\uFF01\uFFE5\u2026\u2026\uFF08\uFF09\u2014\u2014\u201C\u201D\u2018\u2019",
            // ideographic full stop, full-width question mark and comma
            "\u3002\uFF1F\uFF0C",
            // a short Chinese phrase ending in a full-width exclamation mark
            "\u4E0D\u8981\u554A\uFF01",
            // Japanese hiragana (not Chinese; expected to be rejected)
            " \u3084 \u3081 \u3066 ",
            " ga ",
            "???"
        };
        for (String str : strArr) {
            System.out.println("===========> Test string: " + str);
            System.out.println(" Regular judgment result: " + isChineseByREG(str) + " -- " + isChineseByName(str));
            System.out.println("Unicode Determine the results : " + isChinese(str));
            System.out.println(" Detailed judgment list: ");
            // Walk by code point so a supplementary-plane ideograph is shown as one symbol.
            for (int i = 0; i < str.length();) {
                int codePoint = str.codePointAt(i);
                String shown = new String(Character.toChars(codePoint));
                System.out.println(shown + " --> " + (isChineseCodePoint(codePoint) ? " is " : " no "));
                i += Character.charCount(codePoint);
            }
        }
    }

    /**
     * Tells whether a single UTF-16 unit is a Chinese character or symbol.
     *
     * <p>A lone surrogate is never reported as Chinese; use {@link #isChinese(String)}
     * for text that may contain supplementary-plane ideographs.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} lies in one of the accepted Unicode blocks
     */
    private static boolean isChinese(char c) {
        return isChineseCodePoint(c);
    }

    /**
     * Classifies a full Unicode code point by its Unicode block.
     *
     * <p>Accepted blocks: CJK unified ideographs (base, extension A, extension B),
     * CJK compatibility ideographs, CJK symbols and punctuation, half/full-width forms
     * and general punctuation. The last three are what make Chinese punctuation count.
     *
     * @param codePoint a valid Unicode code point
     * @return {@code true} if the code point belongs to one of the accepted blocks
     */
    private static boolean isChineseCodePoint(int codePoint) {
        // of(int) is required here: of(char) can never report a supplementary-plane
        // block such as extension B.
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(codePoint);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Tells whether a string contains at least one Chinese character or symbol,
     * judged by Unicode block.
     *
     * @param strName the text to inspect; may be {@code null}
     * @return {@code true} if any code point is accepted by the block check;
     *         {@code false} for {@code null} or empty input
     */
    public static boolean isChinese(String strName) {
        if (strName == null) {
            return false;
        }
        for (int i = 0; i < strName.length();) {
            int codePoint = strName.codePointAt(i);
            if (isChineseCodePoint(codePoint)) {
                return true;
            }
            i += Character.charCount(codePoint);
        }
        return false;
    }

    /**
     * Tells whether a string contains a character in the fixed range U+4E00..U+9FBF.
     * Only part of the CJK repertoire is covered, and punctuation is not recognized.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if a character in the range is found; {@code false} for {@code null}
     */
    public static boolean isChineseByREG(String str) {
        if (str == null) {
            return false;
        }
        return CJK_RANGE_PATTERN.matcher(str).find();
    }

    /**
     * Tells whether a string contains an assigned character from the
     * "CJK Unified Ideographs" Unicode block. Only part of the CJK repertoire is
     * covered, and punctuation is not recognized.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if such a character is found; {@code false} for {@code null}
     */
    public static boolean isChineseByName(String str) {
        if (str == null) {
            return false;
        }
        return CJK_BLOCK_PATTERN.matcher(str).find();
    }
}