From ad5e63da8bdb12f0ec6b8f2bc9bc807a53db5776 Mon Sep 17 00:00:00 2001
From: Viginum-DataScientist-6
 <210390336+Viginum-DataScientist-6@users.noreply.github.com>
Date: Mon, 26 May 2025 12:54:43 +0200
Subject: [PATCH] refactor: extract emojis_remover with dependency injection

- enables testing and benchmarking different implementations
---
 d3lta/emojis_remover.py | 90 +++++++++++++++++++++++++++++++++++++++++
 d3lta/faissd3lta.py     | 51 +++++------------------
 2 files changed, 100 insertions(+), 41 deletions(-)
 create mode 100644 d3lta/emojis_remover.py

diff --git a/d3lta/emojis_remover.py b/d3lta/emojis_remover.py
new file mode 100644
index 0000000..60019c6
--- /dev/null
+++ b/d3lta/emojis_remover.py
@@ -0,0 +1,90 @@
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import final
+
+import demoji
+
+
+@dataclass
+class EmojisRemover(ABC):
+    """Base class for emoji-removal engines.
+
+    Subclasses implement `_remove_symbols_implementation`.
+    """
+
+    # When True, `remove_symbols` returns the engine output without any cleanup.
+    skip_postprocessing: bool = False
+
+    # Not annotated, so `@dataclass` does not treat these two names as fields.
+    _whitespace_or_newline_capturing_group_name = "whitespace_or_newline"
+    _repeated_whitespace_pattern = re.compile(
+        rf"[ ]+(?P<{_whitespace_or_newline_capturing_group_name}> |\n)"
+    )
+
+    @final
+    def remove_symbols(self, text: str) -> str:
+        """Remove symbols from `text`, then post-process unless it is disabled."""
+        cleaned = self._remove_symbols_implementation(text)
+        return cleaned if self.skip_postprocessing else self._postprocess(cleaned)
+
+    @abstractmethod
+    def _remove_symbols_implementation(self, text: str) -> str:
+        """Return `text` without the symbols this engine matches."""
+
+    def _postprocess(self, text: str) -> str:
+        """Collapse leftover runs of spaces, then strip both ends."""
+        return self._remove_repeated_whitespace(text).strip()
+
+    def _remove_repeated_whitespace(self, text: str) -> str:
+        """Replace spaces followed by a space or newline with that last character."""
+        replacement = rf"\g<{self._whitespace_or_newline_capturing_group_name}>"
+        return self._repeated_whitespace_pattern.sub(replacement, text)
+
+
+class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
+    # One character class of hard-coded code point ranges; comments name the blocks.
+    SYMBOLS_REGEX = re.compile(
+        "["
+        "\U00002000-\U0000206f"  # General Punctuation (incl. curly quotes, dashes, "…")
+        "\U00002190-\U000021ff"  # Arrows
+        "\U00002300-\U000023ff"  # Miscellaneous Technical
+        "\U00002400-\U0000243f"  # Control Pictures
+        "\U00002440-\U0000245f"  # Optical Character Recognition
+        "\U00002460-\U0000249f"  # Enclosed Alphanumerics (first part)
+        "\U000024b0-\U000024ff"  # Enclosed Alphanumerics (rest); NOTE(review): U+24A0-24AF skipped, confirm
+        "\U00002500-\U0000257f"  # Box Drawing
+        "\U00002580-\U000025ff"  # Block Elements + Geometric Shapes
+        "\U00002600-\U000026ff"  # Miscellaneous Symbols
+        "\U00002700-\U000027bf"  # Dingbats
+        "\U000027c0-\U000027ef"  # Miscellaneous Mathematical Symbols-A
+        "\U000027f0-\U000027ff"  # Supplemental Arrows-A
+        "\U00002800-\U000028ff"  # Braille Patterns
+        "\U00002900-\U0000297f"  # Supplemental Arrows-B
+        "\U00002980-\U000029ff"  # Miscellaneous Mathematical Symbols-B
+        "\U00002a00-\U00002aff"  # Supplemental Mathematical Operators
+        "\U00002b00-\U00002bff"  # Miscellaneous Symbols and Arrows
+        "\U00003200-\U0000325f"  # Enclosed CJK Letters and Months (start of the block only)
+        "\U0001f300-\U0001f5ff"  # Miscellaneous Symbols and Pictographs
+        "\U0001f600-\U0001f64f"  # Emoticons
+        "\U0001f680-\U0001f6ff"  # Transport and Map Symbols
+        "\U0001f700-\U0001f77f"  # Alchemical Symbols
+        "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
+        "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
+        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
+        "\U0001fa00-\U0001fa6f"  # Chess Symbols
+        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
+        "\U0001fb00-\U0001fbff"  # Symbols for Legacy Computing
+        "\U0000200d"  # Zero Width Joiner (ZWJ); already inside the first range above
+        "\U0000fe0f"  # Variation Selector-16 (emoji style)
+        "\U0000fe0e"  # Variation Selector-15 (text style)
+        "]+"  # a run of consecutive listed characters is matched as one unit
+    )
+
+    def _remove_symbols_implementation(self, text: str) -> str:
+        return self.SYMBOLS_REGEX.sub(r"", text)  # delete every match, keep the rest
+
+
+class DemojiEmojisRemover(EmojisRemover):  # engine that delegates matching to `demoji`
+    def _remove_symbols_implementation(self, text: str) -> str:
+        return demoji.replace(text)  # no replacement passed; presumably defaults to "" (confirm)
diff --git a/d3lta/faissd3lta.py b/d3lta/faissd3lta.py
index d044cac..59ead83 100644
--- a/d3lta/faissd3lta.py
+++ b/d3lta/faissd3lta.py
@@ -20,6 +20,8 @@ from polyleven import levenshtein
 from tqdm.auto import trange
 from tqdm.contrib.concurrent import thread_map
 
+from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover
+
 
 def timeit(func):
     @wraps(func)
@@ -48,46 +50,6 @@ def grouper(iterable, n):
 #### Preprocessing Dataset ####
 ###############################
 
-# Unicode ranges for most emojis
-SYMBOL_REGEX = re.compile(
-    "["
-    "\U00002000-\U0000206F"  # General Punctuation
-    "\U00002190-\U000021FF"  # Arrows
-    "\U00002300-\U000023FF"  # Miscellaneous Technical
-    "\U00002400-\U0000243F"  # Control Pictures
-    "\U00002440-\U0000245F"  # Optical Character Recognition
-    "\U00002460-\U0000249F"  # Enclosed Alphanumerics
-    "\U000024B0-\U000024FF"  # Enclosed Alphanumerics Extension 
-    "\U00002500-\U0000257F"  # Box Drawing
-    "\U00002580-\U000025FF"  # Block Elements
-    "\U00002600-\U000026FF"  # Miscellaneous Symbols
-    "\U00002700-\U000027BF"  # Dingbats
-    "\U000027C0-\U000027EF"  # Miscellaneous Mathematical Symbols-A
-    "\U000027F0-\U000027FF"  # Supplemental Arrows-A
-    "\U00002800-\U000028FF"  # Braille Patterns
-    "\U00002900-\U0000297F"  # Supplemental Arrows-B
-    "\U00002980-\U000029FF"  # Miscellaneous Mathematical Symbols-B
-    "\U00002A00-\U00002AFF"  # Supplemental Mathematical Operators
-    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
-    "\U00003200-\U0000325F"  # Enclosed CJK Letters and Months
-    "\U0001F300-\U0001F5FF"  # symbols & pictographs
-    "\U0001F600-\U0001F64F"  # emoticons
-    "\U0001F680-\U0001F6FF"  # transport & map symbols
-    "\U0001F700-\U0001F77F"  # alchemical symbols
-    "\U0001F780-\U0001F7FF"  # Geometric Shapes
-    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
-    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
-    "\U0001FA00-\U0001FA6F"  # Chess Symbols
-    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
-    "\U0001FB00-\U0001FBFF"  # Symbols for Legacy Computing
-    "\U0000200D"              # Zero Width Joiner (ZWJ)
-    "\U0000FE0F"              # Variation Selector-16 (emoji style)
-    "\U0000FE0E"              # Variation Selector-15 (text style)
-    "]+"
-)
-
-def remove_symbols(text):
-    return SYMBOL_REGEX.sub(r'', text)
 
 def preprocess_text(
     s: str | list[str] | set[str] | frozenset[str] | pd.Series,
@@ -100,6 +62,7 @@ def preprocess_text(
     remove_twitter_cropend: bool = False,
     replace_newline_characters: bool = True,
     remove_punctuation: bool = False,
+    emojis_remover: EmojisRemover | None = None,
 ):
     """
     clean a list-like of strings, performing all the following treatments by default
@@ -114,10 +77,16 @@ def preprocess_text(
         remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False.
         replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
         remove_punctuation (bool, optional): remove punctuation from the text, be careful, it will remove # of hashtags too. Defaults to False.
+        emojis_remover (EmojisRemover, optional):
+            if provided, overrides the default engine used for emojis matching and removal.
+            Has no effect if `remove_emojis` is set to False.
     """
     if s is None:
         s = ""
 
+    if emojis_remover is None:
+        emojis_remover = ExplicitUnicodeBlocksEmojisRemover()
+
     assert isinstance(s, (str, list, pd.Series, set, frozenset))
 
     if isinstance(s, str):
@@ -145,7 +114,7 @@ def preprocess_text(
             for msg in s
         ]
     if remove_emojis:
-        s = [remove_symbols(msg).strip() for msg in s]
+        s = [emojis_remover.remove_symbols(msg).strip() for msg in s]
 
     if remove_hashtags_frontend:
         if (not remove_urls) or (not remove_mentions):