[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U000323af\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?
(pip install regex)
[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?
Can I use RegExp: Unicode property escapes
[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?
[\u4e00-\u9fff\u3400-\u4dbf\u{20000}-\u{2a6df}\u{2a700}-\u{2ebef}\u{30000}-\u{323af}\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?
([\u4e00-\u9fff\u3400-\u4dbf\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007]|[\ud840-\ud868\ud86a-\ud879\ud880-\ud887][\udc00-\udfff]|\ud869[\udc00-\udedf\udf00-\udfff]|\ud87a[\udc00-\udfef]|\ud888[\udc00-\udfaf])([\ufe00-\ufe0f]|\udb40[\udd00-\uddef])?
import json
import re

# Matches one Chinese character, optionally followed by one variation selector.
# First class: explicit CJK Unified Ideograph ranges, twelve individual code
# points from U+FA0E to U+FA29, and U+3006 / U+3007.
# Second class (optional): U+FE00-U+FE0F or U+E0100-U+E01EF.
pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U000323af\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?')

# Print every match together with its JSON-escaped form; the leading ASCII 'a'
# in the sample is not matched.
for i, match in enumerate(pattern.finditer('a〆文𦫖﨑禰󠄀')):
    print(f'Match {i}:', match[0], json.dumps(match[0]))
# Match 0: 〆 "\u3006"
# Match 1: 文 "\u6587"
# Match 2: 𦫖 "\ud85a\uded6"
# Match 3: 﨑 "\ufa11"
# Match 4: 禰󠄀 "\u79b0\udb40\udd00"
pip install regex
import json
# Third-party `regex` package (pip install regex); it is used here in place of
# the standard `re` module because the pattern relies on \p{...} property escapes.
import regex as re

# Matches one Chinese character, optionally followed by one variation selector.
# First class: any code point with the Unified_Ideograph property, plus
# U+3006 / U+3007.
# Second class (optional): U+FE00-U+FE0F or U+E0100-U+E01EF.
pattern = re.compile(r'[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\U000e0100-\U000e01ef]?')

# Print every match together with its JSON-escaped form; the leading ASCII 'a'
# in the sample is not matched.
for i, match in enumerate(pattern.finditer('a〆文𦫖﨑禰󠄀')):
    print(f'Match {i}:', match[0], json.dumps(match[0]))
# Match 0: 〆 "\u3006"
# Match 1: 文 "\u6587"
# Match 2: 𦫖 "\ud85a\uded6"
# Match 3: 﨑 "\ufa11"
# Match 4: 禰󠄀 "\u79b0\udb40\udd00"
Can I use RegExp: Unicode property escapes
// One Chinese character (any Unified_Ideograph code point, plus U+3006 and
// U+3007), optionally followed by a single variation selector
// (U+FE00-U+FE0F or U+E0100-U+E01EF). Requires the `u` flag for \p{...}
// and \u{...}.
const pattern = /[\p{Unified_Ideograph}\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?/gmu;

// Log each match with its `length`, which counts UTF-16 code units.
for (const [index, found] of 'a〆文𦫖﨑禰󠄀'.match(pattern).entries()) {
  console.log(`Match ${index}: ${found}, length: ${found.length}`);
}
// Match 0: 〆, length: 1
// Match 1: 文, length: 1
// Match 2: 𦫖, length: 2
// Match 3: 﨑, length: 1
// Match 4: 禰󠄀, length: 3
// One Chinese character, optionally followed by a single variation selector
// (U+FE00-U+FE0F or U+E0100-U+E01EF). The first class spells out explicit
// code-point ranges instead of a property escape; the `u` flag is still
// required for the \u{...} escapes.
const pattern = /[\u4e00-\u9fff\u3400-\u4dbf\u{20000}-\u{2a6df}\u{2a700}-\u{2ebef}\u{30000}-\u{323af}\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007][\ufe00-\ufe0f\u{e0100}-\u{e01ef}]?/gmu;

// Log each match with its `length`, which counts UTF-16 code units.
for (const [index, found] of 'a〆文𦫖﨑禰󠄀'.match(pattern).entries()) {
  console.log(`Match ${index}: ${found}, length: ${found.length}`);
}
// Match 0: 〆, length: 1
// Match 1: 文, length: 1
// Match 2: 𦫖, length: 2
// Match 3: 﨑, length: 1
// Match 4: 禰󠄀, length: 3
// One Chinese character, optionally followed by a single variation selector,
// written without the `u` flag: supplementary-plane code points are spelled
// as UTF-16 surrogate pairs (high surrogate followed by a low-surrogate class).
const pattern = /([\u4e00-\u9fff\u3400-\u4dbf\ufa0e\ufa0f\ufa11\ufa13\ufa14\ufa1f\ufa21\ufa23\ufa24\ufa27\ufa28\ufa29\u3006\u3007]|[\ud840-\ud868\ud86a-\ud879\ud880-\ud887][\udc00-\udfff]|\ud869[\udc00-\udedf\udf00-\udfff]|\ud87a[\udc00-\udfef]|\ud888[\udc00-\udfaf])([\ufe00-\ufe0f]|\udb40[\udd00-\uddef])?/gm;

// Log each match with its `length`, which counts UTF-16 code units.
for (const [index, found] of 'a〆文𦫖﨑禰󠄀'.match(pattern).entries()) {
  console.log(`Match ${index}: ${found}, length: ${found.length}`);
}
// Match 0: 〆, length: 1
// Match 1: 文, length: 1
// Match 2: 𦫖, length: 2
// Match 3: 﨑, length: 1
// Match 4: 禰󠄀, length: 3
CJK Unified Ideographs:
U+4E00-U+9FFF: CJK Unified Ideographs
U+3400-U+4DBF: CJK Unified Ideographs Extension A
U+20000-U+2A6DF: CJK Unified Ideographs Extension B
U+2A700-U+2B73F: CJK Unified Ideographs Extension C
U+2B740-U+2B81F: CJK Unified Ideographs Extension D
U+2B820-U+2CEAF: CJK Unified Ideographs Extension E
U+2CEB0-U+2EBEF: CJK Unified Ideographs Extension F
U+30000-U+3134F: CJK Unified Ideographs Extension G
U+31350-U+323AF: CJK Unified Ideographs Extension H
12 CJK Unified Ideographs in the CJK Compatibility Ideographs block:
U+FA0E: 﨎
U+FA0F: 﨏
U+FA11: 﨑
U+FA13: 﨓
U+FA14: 﨔
U+FA1F: 﨟
U+FA21: 﨡
U+FA23: 﨣
U+FA24: 﨤
U+FA27: 﨧
U+FA28: 﨨
U+FA29: 﨩
2 characters in the CJK Symbols and Punctuation block that are often regarded as Chinese characters:
U+3006: 〆
U+3007: 〇
Variation Selectors:
U+FE00-U+FE0F: Variation Selectors
U+E0100-U+E01EF: Variation Selectors Supplement
\p{sc=Han} (the Han script in Unicode) is wrong because it selects more than Chinese characters.
\p{Ideo} (the Ideographic property in Unicode) is wrong because it selects more than Chinese characters.
\p{Variation_Selector} is wrong because it also selects Mongolian variation selectors.