refactor: extract emojis_remover with dependency injection

- enables testing and benchmarking different implementations
Этот коммит содержится в:
Viginum-DataScientist-6 2025-05-26 12:54:43 +02:00
родитель 9d178deb74
Коммит ad5e63da8b
2 изменённых файлов: 100 добавлений и 41 удалений

90
d3lta/emojis_remover.py Обычный файл
Просмотреть файл

@ -0,0 +1,90 @@
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import final
import demoji
@dataclass
class EmojisRemover(ABC):
    """Base class for interchangeable emoji/symbol removal strategies.

    Subclasses supply the actual matching logic by implementing
    ``_remove_symbols_implementation``; callers use ``remove_symbols``, which
    runs that implementation and then (unless disabled) tidies up the
    whitespace left behind by the removed characters.

    Attributes:
        skip_postprocessing: When True, ``remove_symbols`` returns the raw
            output of the concrete implementation and skips the whitespace
            clean-up step.
    """

    skip_postprocessing: bool = False

    # Deliberately left un-annotated: an annotated name in a dataclass body
    # would be turned into an instance field.
    _whitespace_or_newline_capturing_group_name = "whitespace_or_newline"
    # One or more spaces followed by either a space or a newline; the trailing
    # character is captured so the substitution can keep it.
    _repeated_whitespace_pattern = re.compile(
        rf"[ ]+(?P<{_whitespace_or_newline_capturing_group_name}> |\n)"
    )

    @final
    def remove_symbols(self, text: str) -> str:
        """Strip symbols from ``text`` and, by default, normalize whitespace.

        Args:
            text: The string to clean.

        Returns:
            The output of ``_remove_symbols_implementation``, passed through
            ``_postprocess`` unless ``skip_postprocessing`` is set.
        """
        cleaned = self._remove_symbols_implementation(text)
        return cleaned if self.skip_postprocessing else self._postprocess(cleaned)

    @abstractmethod
    def _remove_symbols_implementation(self, text: str) -> str:
        """Return ``text`` with the symbols matched by this strategy removed."""

    def _postprocess(self, text: str) -> str:
        """Collapse runs of spaces, then strip leading/trailing whitespace."""
        return self._remove_repeated_whitespace(text).strip()

    def _remove_repeated_whitespace(self, text: str) -> str:
        """Replace each "spaces + (space | newline)" run by its last character.

        Consecutive spaces therefore shrink to a single space, and spaces
        sitting right before a newline are dropped while the newline is kept.
        """
        keep_captured_character = (
            rf"\g<{self._whitespace_or_newline_capturing_group_name}>"
        )
        return re.sub(
            pattern=self._repeated_whitespace_pattern,
            repl=keep_captured_character,
            string=text,
        )
class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
    """Symbol remover driven by a hand-maintained list of code point ranges.

    ``SYMBOLS_REGEX`` is a single character class (with a ``+`` quantifier)
    assembled from the ranges and individual code points listed below; every
    run of matching characters is deleted from the input.
    """

    # Inclusive (first, last) code point ranges. The trailing comments name
    # the Unicode block each range falls in.
    _CODE_POINT_RANGES = (
        ("\U00002000", "\U0000206f"),  # General Punctuation
        ("\U00002190", "\U000021ff"),  # Arrows
        ("\U00002300", "\U000023ff"),  # Miscellaneous Technical
        ("\U00002400", "\U0000243f"),  # Control Pictures
        ("\U00002440", "\U0000245f"),  # Optical Character Recognition
        ("\U00002460", "\U0000249f"),  # Enclosed Alphanumerics (lower part)
        # Enclosed Alphanumerics (upper part); U+24A0-U+24AF is not covered
        ("\U000024b0", "\U000024ff"),
        ("\U00002500", "\U0000257f"),  # Box Drawing
        ("\U00002580", "\U000025ff"),  # Block Elements and Geometric Shapes
        ("\U00002600", "\U000026ff"),  # Miscellaneous Symbols
        ("\U00002700", "\U000027bf"),  # Dingbats
        ("\U000027c0", "\U000027ef"),  # Miscellaneous Mathematical Symbols-A
        ("\U000027f0", "\U000027ff"),  # Supplemental Arrows-A
        ("\U00002800", "\U000028ff"),  # Braille Patterns
        ("\U00002900", "\U0000297f"),  # Supplemental Arrows-B
        ("\U00002980", "\U000029ff"),  # Miscellaneous Mathematical Symbols-B
        ("\U00002a00", "\U00002aff"),  # Supplemental Mathematical Operators
        ("\U00002b00", "\U00002bff"),  # Miscellaneous Symbols and Arrows
        ("\U00003200", "\U0000325f"),  # Enclosed CJK Letters and Months (start)
        ("\U0001f300", "\U0001f5ff"),  # Miscellaneous Symbols and Pictographs
        ("\U0001f600", "\U0001f64f"),  # Emoticons
        ("\U0001f680", "\U0001f6ff"),  # Transport and Map Symbols
        ("\U0001f700", "\U0001f77f"),  # Alchemical Symbols
        ("\U0001f780", "\U0001f7ff"),  # Geometric Shapes Extended
        ("\U0001f800", "\U0001f8ff"),  # Supplemental Arrows-C
        ("\U0001f900", "\U0001f9ff"),  # Supplemental Symbols and Pictographs
        ("\U0001fa00", "\U0001fa6f"),  # Chess Symbols
        ("\U0001fa70", "\U0001faff"),  # Symbols and Pictographs Extended-A
        ("\U0001fb00", "\U0001fbff"),  # Symbols for Legacy Computing
    )

    # Individual code points that glue or restyle emoji sequences.
    _STANDALONE_CODE_POINTS = (
        # Zero Width Joiner (already inside the General Punctuation range)
        "\U0000200d",
        "\U0000fe0f",  # Variation Selector-16 (emoji style)
        "\U0000fe0e",  # Variation Selector-15 (text style)
    )

    # Unicode ranges for most emojis, compiled into one character class.
    SYMBOLS_REGEX = re.compile(
        "["
        + "".join(f"{first}-{last}" for first, last in _CODE_POINT_RANGES)
        + "".join(_STANDALONE_CODE_POINTS)
        + "]+"
    )

    def _remove_symbols_implementation(self, text: str) -> str:
        """Return ``text`` with every ``SYMBOLS_REGEX`` match deleted."""
        symbols_pattern = self.SYMBOLS_REGEX
        return symbols_pattern.sub("", text)
class DemojiEmojisRemover(EmojisRemover):
    """Emoji remover that delegates matching to the ``demoji`` package."""

    def _remove_symbols_implementation(self, text: str) -> str:
        """Return the result of ``demoji.replace`` called on ``text``.

        Only ``text`` is passed, so the replacement string is whatever
        ``demoji.replace`` uses by default.
        """
        text_without_emojis = demoji.replace(text)
        return text_without_emojis

Просмотреть файл

@ -20,6 +20,8 @@ from polyleven import levenshtein
from tqdm.auto import trange
from tqdm.contrib.concurrent import thread_map
from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover
def timeit(func):
@wraps(func)
@ -48,46 +50,6 @@ def grouper(iterable, n):
#### Preprocessing Dataset ####
###############################
# Inclusive (first, last) code point ranges covering most emojis and related
# symbols. The trailing comments name the Unicode block each range falls in.
_SYMBOL_CODE_POINT_RANGES = (
    ("\U00002000", "\U0000206F"),  # General Punctuation
    ("\U00002190", "\U000021FF"),  # Arrows
    ("\U00002300", "\U000023FF"),  # Miscellaneous Technical
    ("\U00002400", "\U0000243F"),  # Control Pictures
    ("\U00002440", "\U0000245F"),  # Optical Character Recognition
    ("\U00002460", "\U0000249F"),  # Enclosed Alphanumerics (lower part)
    # Enclosed Alphanumerics (upper part); U+24A0-U+24AF is not covered
    ("\U000024B0", "\U000024FF"),
    ("\U00002500", "\U0000257F"),  # Box Drawing
    ("\U00002580", "\U000025FF"),  # Block Elements and Geometric Shapes
    ("\U00002600", "\U000026FF"),  # Miscellaneous Symbols
    ("\U00002700", "\U000027BF"),  # Dingbats
    ("\U000027C0", "\U000027EF"),  # Miscellaneous Mathematical Symbols-A
    ("\U000027F0", "\U000027FF"),  # Supplemental Arrows-A
    ("\U00002800", "\U000028FF"),  # Braille Patterns
    ("\U00002900", "\U0000297F"),  # Supplemental Arrows-B
    ("\U00002980", "\U000029FF"),  # Miscellaneous Mathematical Symbols-B
    ("\U00002A00", "\U00002AFF"),  # Supplemental Mathematical Operators
    ("\U00002B00", "\U00002BFF"),  # Miscellaneous Symbols and Arrows
    ("\U00003200", "\U0000325F"),  # Enclosed CJK Letters and Months (start)
    ("\U0001F300", "\U0001F5FF"),  # Miscellaneous Symbols and Pictographs
    ("\U0001F600", "\U0001F64F"),  # Emoticons
    ("\U0001F680", "\U0001F6FF"),  # Transport and Map Symbols
    ("\U0001F700", "\U0001F77F"),  # Alchemical Symbols
    ("\U0001F780", "\U0001F7FF"),  # Geometric Shapes Extended
    ("\U0001F800", "\U0001F8FF"),  # Supplemental Arrows-C
    ("\U0001F900", "\U0001F9FF"),  # Supplemental Symbols and Pictographs
    ("\U0001FA00", "\U0001FA6F"),  # Chess Symbols
    ("\U0001FA70", "\U0001FAFF"),  # Symbols and Pictographs Extended-A
    ("\U0001FB00", "\U0001FBFF"),  # Symbols for Legacy Computing
)

# Individual code points that glue or restyle emoji sequences.
_SYMBOL_STANDALONE_CODE_POINTS = (
    "\U0000200D",  # Zero Width Joiner (already inside General Punctuation)
    "\U0000FE0F",  # Variation Selector-16 (emoji style)
    "\U0000FE0E",  # Variation Selector-15 (text style)
)

# One character class (quantified with "+") matching any run of the above.
SYMBOL_REGEX = re.compile(
    "["
    + "".join(f"{first}-{last}" for first, last in _SYMBOL_CODE_POINT_RANGES)
    + "".join(_SYMBOL_STANDALONE_CODE_POINTS)
    + "]+"
)
def remove_symbols(text):
    """Return ``text`` with every run of ``SYMBOL_REGEX`` matches deleted."""
    text_without_symbols = re.sub(SYMBOL_REGEX, "", text)
    return text_without_symbols
def preprocess_text(
s: str | list[str] | set[str] | frozenset[str] | pd.Series,
@ -100,6 +62,7 @@ def preprocess_text(
remove_twitter_cropend: bool = False,
replace_newline_characters: bool = True,
remove_punctuation: bool = False,
emojis_remover: EmojisRemover | None = None,
):
"""
clean a list-like of strings, performing all the following treatments by default
@ -114,10 +77,16 @@ def preprocess_text(
remove_twitter_cropend (bool, optional): remove Twitter-added "" character at the end of messages that are too long. Defaults to False.
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
remove_punctuation (bool, optional): remove punctuation from the text, be careful, it will remove # of hashtags too. Defaults to False.
emojis_remover (EmojisRemover, optional):
if provided, overrides the default engine used for emojis matching and removal.
Has no effect if `remove_emojis` is set to False.
"""
if s is None:
s = ""
if emojis_remover is None:
emojis_remover = ExplicitUnicodeBlocksEmojisRemover()
assert isinstance(s, (str, list, pd.Series, set, frozenset))
if isinstance(s, str):
@ -145,7 +114,7 @@ def preprocess_text(
for msg in s
]
if remove_emojis:
s = [remove_symbols(msg).strip() for msg in s]
s = [emojis_remover.remove_symbols(msg).strip() for msg in s]
if remove_hashtags_frontend:
if (not remove_urls) or (not remove_mentions):