зеркало из
https://github.com/VIGINUM-FR/D3lta.git
synced 2025-10-29 05:04:20 +02:00
104 строки
4.7 KiB
Python
104 строки
4.7 KiB
Python
import re
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from typing import final
|
|
|
|
import demoji
|
|
|
|
|
|
@dataclass
class EmojisRemover(ABC):
    """Template for symbol/emoji strippers with optional whitespace clean-up.

    Concrete subclasses provide the actual removal logic through
    ``_remove_symbols_implementation``. Callers use ``remove_symbols``, which
    runs that implementation and then, unless ``skip_postprocessing`` is set,
    tidies up the whitespace left behind.
    """

    # When True, ``remove_symbols`` returns the implementation's raw output
    # without collapsing or stripping any whitespace.
    skip_postprocessing: bool = False

    # Name of the regex group holding the single character (space or newline)
    # that survives after a run of spaces has been collapsed.
    _whitespace_or_newline_capturing_group_name = "whitespace_or_newline"
    # One or more spaces followed by a space or a newline. Substituting the
    # match with the captured group collapses runs of spaces into one and
    # drops spaces that directly precede a newline.
    _repeated_whitespace_pattern = re.compile(
        rf"[ ]+(?P<{_whitespace_or_newline_capturing_group_name}> |\n)"
    )

    @final
    def remove_symbols(self, text: str) -> str:
        """Return ``text`` with symbols removed.

        The removal itself is delegated to ``_remove_symbols_implementation``;
        the result is post-processed unless ``skip_postprocessing`` is True.
        """
        cleaned = self._remove_symbols_implementation(text)
        return cleaned if self.skip_postprocessing else self._postprocess(cleaned)

    def _postprocess(self, text: str) -> str:
        """Collapse repeated spaces, then strip leading/trailing whitespace."""
        # No separate "whitespace before newline" pass is needed: the
        # repeated-whitespace pattern already covers that case.
        return self._remove_repeated_whitespace(text).strip()

    @abstractmethod
    def _remove_symbols_implementation(self, text: str) -> str:
        """Return ``text`` with the symbols targeted by the subclass removed."""
        ...

    def _remove_repeated_whitespace(self, text: str) -> str:
        """Replace each run of spaces ending in a space/newline by that last character."""
        kept_character = rf"\g<{self._whitespace_or_newline_capturing_group_name}>"
        return self._repeated_whitespace_pattern.sub(kept_character, text)
|
|
|
|
|
|
class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
    """Remove symbols and emojis by matching an explicit list of Unicode ranges.

    Every maximal run of characters belonging to one of the ranges listed in
    ``SYMBOLS_REGEX`` is deleted from the text.
    """

    # Unicode ranges for most emojis and pictographic symbols, plus the
    # invisible joiners/selectors used to compose emoji sequences.
    SYMBOLS_REGEX = re.compile(
        "["
        "\U000020d0-\U000020ff"  # Combining Diacritical Marks for Symbols
        "\U00002190-\U000021ff"  # Arrows
        "\U00002300-\U000023ff"  # Miscellaneous Technical
        "\U00002400-\U0000243f"  # Control Pictures
        "\U00002440-\U0000245f"  # Optical Character Recognition
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        # The whole block is covered in one range so that U+24A0-U+24AF
        # (parenthesized small letters e..t) are treated like their neighbours.
        "\U00002460-\U000024ff"  # Enclosed Alphanumerics
        "\U00002500-\U0000257f"  # Box Drawing
        "\U00002580-\U000025ff"  # Block Elements (U+2580-259F) and Geometric Shapes (U+25A0-25FF)
        "\U00002600-\U000026ff"  # Miscellaneous Symbols
        "\U00002700-\U000027bf"  # Dingbats
        "\U000027c0-\U000027ef"  # Miscellaneous Mathematical Symbols-A
        "\U000027f0-\U000027ff"  # Supplemental Arrows-A
        "\U00002800-\U000028ff"  # Braille Patterns
        "\U00002900-\U0000297f"  # Supplemental Arrows-B
        "\U00002980-\U000029ff"  # Miscellaneous Mathematical Symbols-B
        "\U00002a00-\U00002aff"  # Supplemental Mathematical Operators
        "\U00002b00-\U00002bff"  # Miscellaneous Symbols and Arrows
        "\U00003000-\U0000303f"  # CJK Symbols and Punctuation
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U00003200-\U000032ff"  # Enclosed CJK Letters and Months
        "\U0001f000-\U0001f02f"  # Mahjong Tiles
        "\U0001f030-\U0001f09f"  # Domino Tiles
        "\U0001f0a0-\U0001f0ff"  # Playing Cards
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U0001f100-\U0001f1ff"  # Enclosed Alphanumeric Supplement
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U0001f200-\U0001f2ff"  # Enclosed Ideographic Supplement
        "\U0001f300-\U0001f5ff"  # Miscellaneous Symbols and Pictographs
        "\U0001f600-\U0001f64f"  # Emoticons
        "\U0001f650-\U0001f67f"  # Ornamental Dingbats
        "\U0001f680-\U0001f6ff"  # Transport and Map Symbols
        "\U0001f700-\U0001f77f"  # Alchemical Symbols
        "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
        "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
        "\U0001fa00-\U0001fa6f"  # Chess Symbols
        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
        "\U0001fb00-\U0001fbff"  # Symbols for Legacy Computing
        "\U000e0000-\U000e007f"  # Tags (used for modifying emojis with region modifiers in particular)
        "\U0000200d"  # Zero Width Joiner (ZWJ)
        "\U0000fe0f"  # Variation Selector-16 (emoji style)
        "\U0000fe0e"  # Variation Selector-15 (text style)
        "]+"
    )

    def _remove_symbols_implementation(self, text: str) -> str:
        """Return ``text`` with every run of characters matched by ``SYMBOLS_REGEX`` deleted."""
        return self.SYMBOLS_REGEX.sub("", text)
|
|
|
|
|
|
class DemojiEmojisRemover(EmojisRemover):
    """Remove emojis by delegating detection to the ``demoji`` package."""

    def _remove_symbols_implementation(self, text: str) -> str:
        """Return the result of ``demoji.replace`` applied to ``text``.

        No replacement string is passed, so the library's default is used
        (presumably the empty string; confirm against the demoji docs).
        """
        text_without_emojis = demoji.replace(text)
        return text_without_emojis
|