зеркало из
https://github.com/VIGINUM-FR/D3lta.git
synced 2025-10-29 13:06:10 +02:00
fix: fix unicode ranges for emojis detection
Этот коммит содержится в:
родитель
8cbd9d87a9
Коммит
72244a8ade
@ -46,12 +46,14 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
|
||||
# Unicode ranges for most emojis
|
||||
SYMBOLS_REGEX = re.compile(
|
||||
"["
|
||||
"\U00002000-\U0000206f" # General Punctuation
|
||||
"\U000020d0-\U000020ff" # Combining Diacritical Marks for Symbols
|
||||
"\U00002190-\U000021ff" # Arrows
|
||||
"\U00002300-\U000023ff" # Miscellaneous Technical
|
||||
"\U00002400-\U0000243f" # Control Pictures
|
||||
"\U00002440-\U0000245f" # Optical Character Recognition
|
||||
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
|
||||
"\U00002460-\U0000249f" # Enclosed Alphanumerics
|
||||
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
|
||||
"\U000024b0-\U000024ff" # Enclosed Alphanumerics (tail of the U+2460-U+24FF block; note that U+24A0-U+24AF is not covered)
|
||||
"\U00002500-\U0000257f" # Box Drawing
|
||||
"\U00002580-\U000025ff" # Block Elements (U+2580-U+259F) and Geometric Shapes (U+25A0-U+25FF)
|
||||
@ -64,9 +66,19 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
|
||||
"\U00002980-\U000029ff" # Miscellaneous Mathematical Symbols-B
|
||||
"\U00002a00-\U00002aff" # Supplemental Mathematical Operators
|
||||
"\U00002b00-\U00002bff" # Miscellaneous Symbols and Arrows
|
||||
"\U00003200-\U0000325f" # Enclosed CJK Letters and Months
|
||||
"\U0001f300-\U0001f5ff" # symbols & pictographs
|
||||
"\U0001f600-\U0001f64f" # emoticons
|
||||
"\U00003000-\U0000303f" # CJK Symbols and Punctuation
|
||||
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
|
||||
"\U00003200-\U000032ff" # Enclosed CJK Letters and Months
|
||||
"\U0001f000-\U0001f02f" # Mahjong Tiles
|
||||
"\U0001f030-\U0001f09f" # Domino Tiles
|
||||
"\U0001f0a0-\U0001f0ff" # Playing cards
|
||||
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
|
||||
"\U0001f100-\U0001f1ff" # Enclosed Alphanumeric Supplement
|
||||
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
|
||||
"\U0001f200-\U0001f2ff" # Enclosed Ideographic Supplement
|
||||
"\U0001f300-\U0001f5ff" # Miscellaneous Symbols and Pictographs
|
||||
"\U0001f600-\U0001f64f" # Emoticons
|
||||
"\U0001f650-\U0001f67f" # Ornamental Dingbats
|
||||
"\U0001f680-\U0001f6ff" # transport & map symbols
|
||||
"\U0001f700-\U0001f77f" # alchemical symbols
|
||||
"\U0001f780-\U0001f7ff" # Geometric Shapes Extended
|
||||
@ -75,6 +87,7 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
|
||||
"\U0001fa00-\U0001fa6f" # Chess Symbols
|
||||
"\U0001fa70-\U0001faff" # Symbols and Pictographs Extended-A
|
||||
"\U0001fb00-\U0001fbff" # Symbols for Legacy Computing
|
||||
"\U000e0000-\U000e007f" # Tags (used in emoji tag sequences, in particular for subdivision/regional flags)
|
||||
"\U0000200d" # Zero Width Joiner (ZWJ)
|
||||
"\U0000fe0f" # Variation Selector-16 (emoji style)
|
||||
"\U0000fe0e" # Variation Selector-15 (text style)
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user