[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U000323af\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?
(pip install regex)
[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?
Can I use RegExp: Unicode property escapes
[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?
[\u4e00-\u9fff\u3400-\u4dbf\u{20000}-\u{2a6df}\u{2a700}-\u{2ebef}\u{30000}-\u{323af}\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?
([\u4e00-\u9fff\u3400-\u4dbf\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007]|[\ud840-\ud868\ud86a-\ud879\ud880-\ud887][\udc00-\udfff]|\ud869[\udc00-\udedf\udf00-\udfff]|\ud87a[\udc00-\udfef]|\ud888[\udc00-\udfaf])([\ufe00-\ufe0f]|\udb40[\udd00-\uddef])?
import json
import re

# Matches one Chinese character optionally followed by one variation selector.
# First class: CJK Unified Ideographs (U+4E00-U+9FFF), Extension A
# (U+3400-U+4DBF), Extensions B-H (U+20000-U+2A6DF, U+2A700-U+2EBEF,
# U+30000-U+323AF), twelve code points from the U+FAxx range, and
# U+3006 / U+3007.
# Second class (optional): U+FE00-U+FE0F or U+E0100-U+E01EF.
pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U000323af\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?')

# The leading 'a' is not matched; the last sample character carries an
# invisible variation selector, which json.dumps makes visible as escapes.
for i, match in enumerate(pattern.finditer('a〆文𦫖﨑禰󠄀')):
    print(f'Match {i}:', match[0], json.dumps(match[0]))
# Match 0: 〆 "\u3006"
# Match 1: 文 "\u6587"
# Match 2: 𦫖 "\ud85a\uded6"
# Match 3: 﨑 "\ufa11"
# Match 4: 禰󠄀 "\u79b0\udb40\udd00"

# Requires the third-party `regex` module (pip install regex), imported here
# under the name `re`.
import json
import regex as re

# Matches one character that has the Unified_Ideograph property, or U+3006 /
# U+3007, optionally followed by one variation selector
# (U+FE00-U+FE0F or U+E0100-U+E01EF).
pattern = re.compile(r'[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?')

# The leading 'a' is not matched; the last sample character carries an
# invisible variation selector, which json.dumps makes visible as escapes.
for i, match in enumerate(pattern.finditer('a〆文𦫖﨑禰󠄀')):
    print(f'Match {i}:', match[0], json.dumps(match[0]))
# Match 0: 〆 "\u3006"
# Match 1: 文 "\u6587"
# Match 2: 𦫖 "\ud85a\uded6"
# Match 3: 﨑 "\ufa11"
# Match 4: 禰󠄀 "\u79b0\udb40\udd00"
Can I use RegExp: Unicode property escapes
// One character with the Unified_Ideograph property (or U+3006 / U+3007),
// optionally followed by a variation selector (U+FE00-U+FE0F or
// U+E0100-U+E01EF). The `u` flag enables \p{...} and \u{...} escapes.
const pattern = /[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?/gmu;

// Print every match with its length in UTF-16 code units.
for (const [i, found] of 'a〆文𦫖﨑禰󠄀'.match(pattern).entries()) {
  console.log(`Match ${i}: ${found}, length: ${found.length}`);
}
// Match 0: 〆, length: 1
// Match 1: 文, length: 1
// Match 2: 𦫖, length: 2
// Match 3: 﨑, length: 1
// Match 4: 禰󠄀, length: 3

// Same matcher written with explicit code-point ranges instead of
// \p{Unified_Ideograph}: CJK Unified Ideographs (U+4E00-U+9FFF), Extension A
// (U+3400-U+4DBF), Extensions B-H (U+20000-U+2A6DF, U+2A700-U+2EBEF,
// U+30000-U+323AF), twelve code points from the U+FAxx range, and
// U+3006 / U+3007, optionally followed by a variation selector
// (U+FE00-U+FE0F or U+E0100-U+E01EF). The `u` flag enables \u{...} escapes.
const pattern = /[\u4e00-\u9fff\u3400-\u4dbf\u{20000}-\u{2a6df}\u{2a700}-\u{2ebef}\u{30000}-\u{323af}\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?/gmu;

// Print every match with its length in UTF-16 code units.
'a〆文𦫖﨑禰󠄀'.match(pattern).forEach((match, i) => {
  console.log(`Match ${i}: ${match}, length: ${match.length}`);
});
// Match 0: 〆, length: 1
// Match 1: 文, length: 1
// Match 2: 𦫖, length: 2
// Match 3: 﨑, length: 1
// Match 4: 禰󠄀, length: 3

// Same matcher written without the `u` flag, so characters outside the BMP
// are spelled out as UTF-16 surrogate pairs:
//   - first group: the BMP ranges, or a high surrogate from
//     U+D840-U+D888 followed by the permitted low-surrogate range;
//   - second group (optional): U+FE00-U+FE0F, or U+DB40 followed by
//     U+DD00-U+DDEF.
const pattern = /([\u4e00-\u9fff\u3400-\u4dbf\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007]|[\ud840-\ud868\ud86a-\ud879\ud880-\ud887][\udc00-\udfff]|\ud869[\udc00-\udedf\udf00-\udfff]|\ud87a[\udc00-\udfef]|\ud888[\udc00-\udfaf])([\ufe00-\ufe0f]|\udb40[\udd00-\uddef])?/gm;

// Print every match with its length in UTF-16 code units.
'a〆文𦫖﨑禰󠄀'.match(pattern).forEach((match, i) => {
  console.log(`Match ${i}: ${match}, length: ${match.length}`);
});
// Match 0: 〆, length: 1
// Match 1: 文, length: 1
// Match 2: 𦫖, length: 2
// Match 3: 﨑, length: 1
// Match 4: 禰󠄀, length: 3
CJK Unified Ideographs:
U+4E00-U+9FFF: CJK Unified Ideographs
U+3400-U+4DBF: CJK Unified Ideographs Extension A
U+20000-U+2A6DF: CJK Unified Ideographs Extension B
U+2A700-U+2B73F: CJK Unified Ideographs Extension C
U+2B740-U+2B81F: CJK Unified Ideographs Extension D
U+2B820-U+2CEAF: CJK Unified Ideographs Extension E
U+2CEB0-U+2EBEF: CJK Unified Ideographs Extension F
U+30000-U+3134F: CJK Unified Ideographs Extension G
U+31350-U+323AF: CJK Unified Ideographs Extension H
12 CJK Unified Ideographs in the CJK Compatibility Ideographs block:
U+FA0E: 﨎
U+FA0F: 﨏
U+FA11: 﨑
U+FA13: 﨓
U+FA14: 﨔
U+FA1F: 﨟
U+FA21: 﨡
U+FA23: 﨣
U+FA24: 﨤
U+FA27: 﨧
U+FA28: 﨨
U+FA29: 﨩
2 characters in the CJK Symbols and Punctuation block that are often regarded as Chinese characters:
U+3006: 〆
U+3007: 〇
Variation Selectors:
U+FE00-U+FE0F: Variation Selectors
U+E0100-U+E01EF: Variation Selectors Supplement
\p{sc=Han} (the Han script in Unicode) is wrong because it selects more than Chinese characters.
\p{Ideo} (the Ideographic property in Unicode) is wrong because it selects more than Chinese characters.
\p{Variation_Selector} is wrong because it also selects Mongolian variation selectors.