A thorough way to detect Chinese characters in Java

  • 2020-04-01 01:30:37
  • OfStack

In Java, the usual way to check whether a string contains Chinese characters is a regular expression over the Unicode range 0x4E00–0x9FBB. This range is not very accurate, however, because some Chinese punctuation marks, such as the full-width colon, comma, and full stop (：，。), fall outside it and are not recognized.

The following is a more complete way to perform the check: CharUtil.java


import java.util.regex.Pattern; 

/**
 * Helpers for detecting Chinese (CJK) characters and punctuation in text.
 *
 * <p>Three strategies are offered: a Unicode-block test that also accepts CJK and
 * general punctuation ({@link #isChinese(String)}), and two regular-expression tests
 * that only recognize ideographs ({@link #isChineseByREG(String)} and
 * {@link #isChineseByName(String)}).
 */
public class CharUtil {

    /** One or more characters in U+4E00..U+9FBF, a sub-range of the CJK Unified Ideographs block. */
    private static final Pattern CJK_RANGE_PATTERN = Pattern.compile("[\u4E00-\u9FBF]+");

    /**
     * A single assigned code point of the CJK Unified Ideographs block.
     *
     * <p>Lower-case {@code \p{X}} matches characters that have property X and upper-case
     * {@code \P{X}} matches characters that do not. {@code Cn} is the "unassigned"
     * category, so {@code \P{Cn}} means "assigned". The {@code &&} intersection is only
     * an operator inside a character class, hence the surrounding brackets.
     */
    private static final Pattern CJK_BLOCK_PATTERN =
            Pattern.compile("[\\p{InCJK_Unified_Ideographs}&&\\P{Cn}]");

    /**
     * Demo driver: prints the verdict of every detection method for a few sample strings,
     * followed by a per-code-point breakdown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // NOTE(review): several samples appear to have lost their original non-ASCII
        // content in an earlier conversion; embedded quotes are escaped so they compile.
        String[] strArr = new String[] {
            "www.micmiu.com",
            "!@#$%^&*()_+{}[]|\"'?/:;<>,.",
            " ! RMB... () -- \"\" ' ' ",
            ". ? , ",
            " Don't! ",
            " \u3084 \u3081 \u3066 ", // three hiragana letters, escaped to stay encoding-independent
            " ga ",
            "???"
        };
        for (String str : strArr) {
            System.out.println("===========>  Test string: " + str);
            System.out.println(" Regular judgment result: " + isChineseByREG(str) + " -- " + isChineseByName(str));
            System.out.println("Unicode Determine the results   : " + isChinese(str));
            System.out.println(" Detailed judgment list: ");
            // Step by code point so a surrogate pair is reported as one character.
            for (int i = 0; i < str.length(); ) {
                int codePoint = str.codePointAt(i);
                String shown = new String(Character.toChars(codePoint));
                System.out.println(shown + " --> " + (isChinese(codePoint) ? " is " : " no "));
                i += Character.charCount(codePoint);
            }
        }
    }

    /**
     * Tells whether a code point belongs to one of the Unicode blocks treated as Chinese
     * text: the CJK ideograph blocks, CJK symbols and punctuation, half/full-width forms,
     * and general punctuation.
     *
     * <p>Takes an {@code int} code point rather than a {@code char} because
     * CJK Unified Ideographs Extension B lies outside the Basic Multilingual Plane and
     * cannot be represented by a single {@code char}. A {@code char} argument widens to
     * {@code int} automatically.
     *
     * @param codePoint a Unicode code point
     * @return {@code true} if the code point is in one of the accepted blocks
     */
    private static boolean isChinese(int codePoint) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(codePoint);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Tells whether a string contains at least one Chinese character or punctuation mark,
     * judged by Unicode block.
     *
     * @param strName the text to inspect; may be {@code null}
     * @return {@code true} if any code point is in an accepted block; {@code false} for
     *         {@code null}, empty, or non-matching input
     */
    public static boolean isChinese(String strName) {
        if (strName == null) {
            return false;
        }
        for (int i = 0; i < strName.length(); ) {
            int codePoint = strName.codePointAt(i);
            if (isChinese(codePoint)) {
                return true;
            }
            i += Character.charCount(codePoint);
        }
        return false;
    }

    /**
     * Tells whether a string contains a character in the range U+4E00..U+9FBF.
     * Only part of the CJK ideographs are covered; punctuation is not recognized.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if a character in the range is found; {@code false} otherwise
     *         or when {@code str} is {@code null}
     */
    public static boolean isChineseByREG(String str) {
        if (str == null) {
            return false;
        }
        return CJK_RANGE_PATTERN.matcher(str).find();
    }

    /**
     * Tells whether a string contains an assigned character of the CJK Unified Ideographs
     * block, selected by block name. Only part of the CJK ideographs are covered;
     * punctuation is not recognized.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if such a character is found; {@code false} otherwise or when
     *         {@code str} is {@code null}
     */
    public static boolean isChineseByName(String str) {
        if (str == null) {
            return false;
        }
        return CJK_BLOCK_PATTERN.matcher(str).find();
    }
}


Related articles: