Compare commits

...

44 commits
1.0.0 ... main

Author SHA1 Message Date
Viginum-DataScientist-6
8df5772178 fix(devcontainer): reflect updates to pyproject.toml in #22 2025-07-31 18:50:36 +02:00
Viginum-DataScientist-6
a18992748e refactor: convert [tool.poetry] to [project]
- Use the recommended PEP621 syntax for the pyproject.toml
2025-07-31 13:39:03 +02:00
Viginum-DataScientist-6
045bd4becf chore: remove unnecessary setup.py
- poetry-core is sufficient for handling builds
2025-07-31 13:39:03 +02:00
Viginum-DataScientist-6
3cdea198bb style: trim trailing whitespaces 2025-07-30 16:59:23 +02:00
Viginum-DataScientist-6
1bd593cf43 feat(ci): add guards for local debugging via nektos/act 2025-07-30 16:58:22 +02:00
Viginum-DataScientist-6
b41ab2ce19 chore: ignore debug event file for nektos/act 2025-07-30 16:58:22 +02:00
Viginum-DataScientist-6
1151e21254 feat(devcontainer): install prebuilt nektos/act executable
- Remove the now-unneeded gh CLI devcontainer feature.
2025-07-30 16:56:56 +02:00
Viginum-DataScientist-6
b8fada79c2 chore: remove demoji-based emojis removal
- Also remove demoji related tests and benchmarking code.
    - This speeds up the unit test suite.
2025-07-30 14:42:04 +02:00
Viginum-DataScientist-6
95a07bd5a3 feat(devcontainer): add github.vscode-github-actions extension 2025-07-30 14:37:20 +02:00
Viginum-DataScientist-6
e0c747f43c fix(ci): run pypa/build via pipx
- Removes warning caused by running pip as root
2025-07-30 14:34:52 +02:00
Viginum-DataScientist-6
588f20cd4a feat(ci): cache poetry dependencies 2025-07-30 14:33:18 +02:00
Viginum-DataScientist-6
a92770562b feat(ci): install poetry via pipx
Fixes the warning related to using the root environment's pip when installing poetry via `pip install poetry`.
2025-07-30 14:33:18 +02:00
Viginum-DataScientist-6
991ed8141b style: minor formatting and fix workflow name 2025-07-30 14:33:18 +02:00
Viginum-DataScientist-6
f5f71cca37 fix(ci): restrict release workflow to the main branch
- Prevents creating releases from unprotected branches.
2025-07-30 14:29:06 +02:00
Viginum-DataScientist-6
b1d2b93c24 fix(ci): fix artifacts name 2025-07-29 18:17:12 +02:00
Viginum-DataScientist-6
71a76b0d3a release: bump version 1.0.2 2025-07-29 18:17:12 +02:00
Viginum-DataScientist-6
ed3f0b9db3 feat(docs): switch to PyPI-based installation instructions
Fixes #9.
2025-07-29 17:52:39 +02:00
Viginum-DataScientist-6
8999d23448 fix(docs): minor formatting 2025-07-29 17:52:39 +02:00
Viginum-DataScientist-6
427a873568 ci: run publish-* workflow on manual GitHub releases only
- This setup is inspired by the setup in github.com/python-poetry/poetry.
- Automatically publishing on tag pushes feels brittle; revert to a manual trigger for new PyPI releases.
- Move the test step to a new test workflow to ensure it is still run on each commit.
2025-07-29 17:29:20 +02:00
Viginum-DataScientist-6
0386589b46 style: remove emojis 2025-07-29 17:29:20 +02:00
Viginum-DataScientist-1
c589aebc41
Merge pull request #6 from VIGINUM-FR/Viginum-DataScientist-6-patch-standardize-README-display
fix(docs): standardize README.md display
2025-07-29 17:03:33 +02:00
Viginum-DataScientist-6
56a1f07c1e fix(docs): standardize README.md display
Remove manual text centering
Use standard Markdown syntax for headers
2025-07-29 13:40:08 +00:00
Viginum-DataScientist-1
fb7531405c
Merge pull request #5 from VIGINUM-FR/Viginum-DataScientist-6-docs-add-pypi-release-badge
docs: add PyPI release badge
2025-07-29 15:33:28 +02:00
Viginum-DataScientist-6
80f12d6ee9 docs: add PyPI release badge 2025-07-29 13:20:46 +00:00
Viginum-DataScientist-1
eb1599ee10
Merge pull request #7 from VIGINUM-FR/ci-disable-testpypi-releases
fix: disable automatic releases to test.pypi.org
2025-07-29 15:15:13 +02:00
Viginum-DataScientist-6
c7107aae1d fix: disable automatic releases to test.pypi.org
[test.]pypi.org rejects uploads of distributions with an already existing version number.

Publishing to testpypi on every commit therefore does not work in the current versioning setup, since it leads to duplicate release versions.
2025-07-29 13:04:13 +00:00
Viginum-DataScientist-1
8979129306
Merge pull request #3 from VIGINUM-FR/fast-emoji-replace
Increase emojis removal robustness and performance
2025-07-22 16:32:49 +02:00
Viginum-DataScientist-6
3a4ff9dcd7 fix(ci): Create new post version to enable CI upload to pypi/testpypi 2025-07-22 11:15:44 +02:00
Someone
6a889baf1a Add test on GitHub CI 2025-05-27 15:22:14 +02:00
Viginum-DataScientist-6
aefc13ce31 build: add Dockerfile and devcontainer 2025-05-26 17:57:58 +02:00
Viginum-DataScientist-6
72244a8ade fix: fix unicode ranges for emojis detection 2025-05-26 17:57:58 +02:00
Viginum-DataScientist-6
8cbd9d87a9 fix: add tests and benchmarking for emojis removal 2025-05-26 17:57:58 +02:00
Viginum-DataScientist-6
ad5e63da8b refactor: extract emojis_remover with dependency injection
- enables testing and benchmarking different implementations
2025-05-26 17:49:51 +02:00
Viginum-DataScientist-6
9d178deb74 fix: add type annotations 2025-05-26 17:49:51 +02:00
Viginum-DataScientist-6
b8b6e3940f chore: remove unused imports and organize remaining 2025-05-26 17:49:51 +02:00
Viginum-DataScientist-6
8c342ea5c1 chore: minor formatting 2025-05-26 14:18:24 +02:00
Viginum-DataScientist-6
c1d3767d57 refactor: remove unused imports 2025-05-26 14:18:24 +02:00
Viginum-DataScientist-6
2d95fd97dd fix: re-add demoji dependency for testing and comparison 2025-05-26 14:18:24 +02:00
MathisHammel
e641fb8f50 Added fast emoji support 2025-05-26 13:17:44 +02:00
Viginum-DataScientist-6
80b81896e8 docs: use more specific project description 2025-05-09 17:48:08 +02:00
Viginum-DataScientist-6
2ef2025595 docs: optimize demo gif asset file size
Should enable display on PyPI (PyPI only displays assets up to 10 MB in size)
2025-05-09 17:39:26 +02:00
Viginum-DataScientist-6
2ca8288c07
docs: use direct link to raw asset for demo gif
Should enable display of the image on PyPI
2025-05-09 17:01:28 +02:00
Viginum-DataScientist-6
f8d4bc1925
build: rename publish-to-test-pypi.yml to publish-to-pypi.yml 2025-05-09 16:32:32 +02:00
Viginum-DataScientist-6
6aec0029c8
build: create publish-to-pypi.yml action 2025-05-09 16:32:02 +02:00
15 changed files with 2247 additions and 101 deletions

34
.devcontainer/Dockerfile Normal file

@@ -0,0 +1,34 @@
FROM mcr.microsoft.com/devcontainers/python:1-3.11-bookworm AS d3lta-prod
ENV PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
\
# Poetry
# https://python-poetry.org/docs/configuration/#using-environment-variables
POETRY_VERSION=2.1.1 \
# make poetry install to this location
POETRY_HOME="/opt/poetry" \
# do not ask any interactive questions
POETRY_NO_INTERACTION=1 \
# never create virtual environments automatically
POETRY_VIRTUALENVS_CREATE=false
RUN pip install --no-cache-dir --upgrade pip
RUN pipx install poetry==${POETRY_VERSION}
WORKDIR /app
COPY pyproject.toml poetry.lock README.md LICENSE.txt ./
# pre-install dependencies
RUN --mount=type=cache,target=/root/.cache poetry install --no-root
COPY notebooks /app/notebooks/
COPY d3lta /app/d3lta/
RUN --mount=type=cache,target=/root/.cache poetry install
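# development stage below: extends the production image with dev dependencies and the nektos/act runner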
FROM d3lta-prod AS d3lta-dev
RUN --mount=type=cache,target=/root/.cache poetry install --with dev
# install nektos/act as specified in https://nektosact.com/installation/index.html#bash-script
# the -b flag specifies the target directory (cf. https://github.com/nektos/act/blob/61396d8085a9d812cebf94fa954f5938d48bf2b9/install.sh#L13)
RUN curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -b /usr/bin

17
.devcontainer/devcontainer.json Normal file

@@ -0,0 +1,17 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
"dockerFile": "./Dockerfile",
"context": "..",
"customizations": {
"vscode": {
"extensions": [
"github.vscode-github-actions"
]
}
},
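// docker-in-docker lets the devcontainer run containers itself (e.g. nektos/act jobs)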
"features": {
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
}
}

86
.github/workflows/publish-to-pypi.yml Vendored Normal file

@@ -0,0 +1,86 @@
# derived from https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow
name: Publish Python distribution to PyPI
on:
release:
types: [published]
env:
ACT: false # env.ACT == true when running inside nektos/act
jobs:
build:
name: Build distribution
# based on https://stackoverflow.com/a/74318141
if: ${{ github.event.release.target_commitish == 'main'}}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.x"
- name: Build a binary wheel and a source tarball
run: pipx run build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: distfiles
path: dist/
if-no-files-found: error
# taken from https://github.com/python-poetry/poetry/blob/b580e8aa4fbce53569420e7b42568dfd9e73519f/.github/workflows/release.yaml
upload-built-distribution-to-github-release:
name: Upload (GitHub)
runs-on: ubuntu-latest
permissions:
contents: write
needs: build
steps:
# Checking-out the project since the gh CLI expects to be called in the context of a git repository.
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Retrieve built distribution
uses: actions/download-artifact@v4
with:
name: distfiles
path: dist/
- run: gh release upload "${TAG_NAME}" dist/*.{tar.gz,whl}
# skip step when debugging locally via nektos/act
if: ${{ !env.ACT }}
env:
GH_TOKEN: ${{ github.token }}
TAG_NAME: ${{ github.event.release.tag_name }}
publish-to-pypi:
name: Publish Python distribution to PyPI
needs: build
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/d3lta # pypi is case insensitive so d3lta == D3lta
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- name: Retrieve built distribution
uses: actions/download-artifact@v4
with:
name: distfiles
path: dist/
- name: Publish distribution to PyPI
# skip step when debugging locally via nektos/act
if: ${{ !env.ACT }}
uses: pypa/gh-action-pypi-publish@release/v1
with:
print-hash: true

28
.github/workflows/test.yml Vendored Normal file

@@ -0,0 +1,28 @@
name: Run tests
on: push
jobs:
unit-test:
name: Run unit tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install poetry
run: pipx install poetry
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: poetry
- name: Install dependencies
run: poetry install --with dev
- name: Run tests
run: poetry run pytest

4
.gitignore Vendored

@@ -227,4 +227,6 @@ pyrightconfig.json
*.pkl
*.ftz
use_model_kaggle/
include/
include/
.benchmarks
.act-event.json

README.md

@@ -1,43 +1,22 @@
<h2 align="center"> <a href="https://arxiv.org/abs/2312.17338">D3lta</a></h2>
# D3lta
<h5 align="center">
If you like our project, please give us a star ⭐ on GitHub for the latest update. </h2>
</h5>
<div align=center>
[![arXiv](https://img.shields.io/badge/Arxiv-2312.17338-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2312.17338)
[![PyPI - Version](https://img.shields.io/pypi/v/d3lta?style=flat&logo=pypi&logoColor=%233775A9&label=PyPI)](https://pypi.org/project/d3lta/)
[![arXiv](https://img.shields.io/badge/Arxiv-2312.17338-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2312.17338)
This repository is the official implementation of D3lta, a library for detecting duplicate verbatim contents within a vast amount of documents.
It distinguishes 3 types of duplicate contents : copypasta (almost exact duplicates), rewording and translation. You can run it on CPU.
</div>
---
<img style="display: block; margin: auto;" src="./static/graph.gif"/>
<img style="display: block; margin: auto;" src="https://github.com/VIGINUM-FR/D3lta/raw/main/static/graph.gif"/>
## 💻 Installing
Clone the repository
## 💻 Installation
```bash
git clone https://github.com/VIGINUM-FR/D3lta
```
Navigate to the project
```bash
cd D3lta
```
Install the package
```bash
pip install -e .
# PyPI is case insensitive, so d3lta == D3lta
pip install d3lta
```
## 🚀 Quick start
@@ -163,11 +142,10 @@ matches, df_clusters = semantic_faiss(
matches
```
## 📚 Synthetic dataset
The dataset is available in the release `1.0.0`. It contains the following files:
The dataset is available in the [`1.0.0` release](https://github.com/VIGINUM-FR/D3lta/releases/tag/1.0.0).
It contains the following files:
### `synthetic_dataset_documents.csv`:
@@ -205,10 +183,30 @@ Column details:
## Notebooks
In folder the [`notebooks`](./notebooks/), you can find:
- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): Example of applying threedelta methodology to the synthetic dataset, with a comparison to the true labels.
In the [`notebooks`](./notebooks/) directory, you can find:
- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): example of applying the D3lta methodology to the synthetic dataset, with a comparison to the true labels.
## 👩‍💻 Developing
Clone the repository
```bash
git clone https://github.com/VIGINUM-FR/D3lta
```
Navigate to the project
```bash
cd D3lta
```
Install the package
```bash
pip install -e .
```
## Citation
If you find our paper and code useful in your research, please consider giving a star 🌟 and a citation 📝:

96
d3lta/emojis_remover.py Normal file

@@ -0,0 +1,96 @@
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import final
@dataclass
class EmojisRemover(ABC):
skip_postprocessing: bool = False
@final
def remove_symbols(self, text: str) -> str:
text_without_symbols = self._remove_symbols_implementation(text)
if self.skip_postprocessing:
return text_without_symbols
return self._postprocess(text_without_symbols)
def _postprocess(self, text: str) -> str:
# text = self._remove_whitespace_before_newline(text)
text_without_repeated_whitespace = self._remove_repeated_whitespace(text)
stripped_text_without_repeated_whitespace = (
text_without_repeated_whitespace.strip()
)
return stripped_text_without_repeated_whitespace
@abstractmethod
def _remove_symbols_implementation(self, text: str) -> str: ...
_whitespace_or_newline_capturing_group_name = "whitespace_or_newline"
_repeated_whitespace_pattern = re.compile(
rf"[ ]+(?P<{_whitespace_or_newline_capturing_group_name}> |\n)"
)
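# collapses runs of spaces into the single space or newline that follows them,
# e.g. "a   b" -> "a b" and "trailing  \n" -> "trailing\n"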
def _remove_repeated_whitespace(self, text: str) -> str:
return re.sub(
self._repeated_whitespace_pattern,
rf"\g<{self._whitespace_or_newline_capturing_group_name}>",
text,
)
class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
# Unicode ranges for most emojis
SYMBOLS_REGEX = re.compile(
"["
"\U000020d0-\U000020ff" # Combining Diacritical Marks for Symbols
"\U00002190-\U000021ff" # Arrows
"\U00002300-\U000023ff" # Miscellaneous Technical
"\U00002400-\U0000243f" # Control Pictures
"\U00002440-\U0000245f" # Optical Character Recognition
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
"\U00002460-\U0000249f" # Enclosed Alphanumerics
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
"\U000024b0-\U000024ff" # Enclosed Alphanumerics Extension
"\U00002500-\U0000257f" # Box Drawing
"\U00002580-\U000025ff" # Block Elements
"\U00002600-\U000026ff" # Miscellaneous Symbols
"\U00002700-\U000027bf" # Dingbats
"\U000027c0-\U000027ef" # Miscellaneous Mathematical Symbols-A
"\U000027f0-\U000027ff" # Supplemental Arrows-A
"\U00002800-\U000028ff" # Braille Patterns
"\U00002900-\U0000297f" # Supplemental Arrows-B
"\U00002980-\U000029ff" # Miscellaneous Mathematical Symbols-B
"\U00002a00-\U00002aff" # Supplemental Mathematical Operators
"\U00002b00-\U00002bff" # Miscellaneous Symbols and Arrows
"\U00003000-\U0000303f" # CJK Symbols and Punctuation
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
"\U00003200-\U000032ff" # Enclosed CJK Letters and Months
"\U0001f000-\U0001f02f" # Mahjong Tiles
"\U0001f030-\U0001f09f" # Domino Tiles
"\U0001f0a0-\U0001f0ff" # Playing cards
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
"\U0001f100-\U0001f1ff" # Enclosed Alphanumeric Supplement
# WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
"\U0001f200-\U0001f2ff" # Enclosed Ideographic Supplement
"\U0001f300-\U0001f5ff" # Miscellaneous Symbols and Pictographs
"\U0001f600-\U0001f64f" # Emoticons
"\U0001f650-\U0001f67f" # Ornamental Dingbats
"\U0001f680-\U0001f6ff" # transport & map symbols
"\U0001f700-\U0001f77f" # alchemical symbols
"\U0001f780-\U0001f7ff" # Geometric Shapes
"\U0001f800-\U0001f8ff" # Supplemental Arrows-C
"\U0001f900-\U0001f9ff" # Supplemental Symbols and Pictographs
"\U0001fa00-\U0001fa6f" # Chess Symbols
"\U0001fa70-\U0001faff" # Symbols and Pictographs Extended-A
"\U0001fb00-\U0001fbff" # Symbols for Legacy Computing
"\U000e0000-\U000e007f" # Tags (used for modifying emojis with region modifiers in particular)
"\U0000200d" # Zero Width Joiner (ZWJ)
"\U0000fe0f" # Variation Selector-16 (emoji style)
"\U0000fe0e" # Variation Selector-15 (text style)
"]+"
)
def _remove_symbols_implementation(self, text: str) -> str:
return self.SYMBOLS_REGEX.sub(r"", text)
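For reference, a minimal usage sketch of the `EmojisRemover` API introduced above (the example strings are illustrative):

```python
from d3lta.emojis_remover import ExplicitUnicodeBlocksEmojisRemover

remover = ExplicitUnicodeBlocksEmojisRemover()

# emojis are stripped, then postprocessing collapses repeated whitespace and strips the ends
assert remover.remove_symbols("Hello 👋 world 🌍!") == "Hello world !"

# the dataclass field inherited from EmojisRemover lets callers skip postprocessing
raw = ExplicitUnicodeBlocksEmojisRemover(skip_postprocessing=True)
assert raw.remove_symbols("Hello 👋 world 🌍!") == "Hello  world !"
```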


@@ -1,23 +1,26 @@
from functools import wraps
import os
import re
import time
from functools import wraps
from typing import Union
import demoji
import faiss
import fasttext
from gensim.utils import deaccent
import networkx as nx
import numpy as np
import pandas as pd
from polyleven import levenshtein
import requests
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from tqdm.contrib.concurrent import thread_map
# importing `tensorflow_text` ensures that some ops required by the USE model are available at runtime
import tensorflow_text # noqa: F401 # pylint: disable=unused-import
from gensim.utils import deaccent
from polyleven import levenshtein
from tqdm.auto import trange
import networkx as nx
from tqdm.contrib.concurrent import thread_map
from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover
def timeit(func):
@@ -31,7 +34,7 @@ def timeit(func):
if total_time < 60:
print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
else:
print(f"<<< End {func.__name__}, Took:{np.round((total_time)/60, 1)} min")
print(f"<<< End {func.__name__}, Took:{np.round((total_time) / 60, 1)} min")
return result
return timeit_wrapper
@@ -49,16 +52,17 @@ def grouper(iterable, n):
def preprocess_text(
s,
lower=True,
remove_accents=True,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=False,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=False,
s: str | list[str] | set[str] | frozenset[str] | pd.Series,
lower: bool = True,
remove_accents: bool = True,
remove_urls: bool = True,
remove_mentions: bool = True,
remove_emojis: bool = True,
remove_hashtags_frontend: bool = False,
remove_twitter_cropend: bool = False,
replace_newline_characters: bool = True,
remove_punctuation: bool = False,
emojis_remover: EmojisRemover | None = None,
):
"""
clean a list-like of strings, performing all the following treatments by default
@@ -68,15 +72,21 @@
remove_accents (bool, optional): deaccent the text. Defaults to True.
remove_urls (bool, optional): remove urls from the text. Defaults to True.
remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis and other pictograms from the text. Defaults to True.
remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
remove_twitter_cropend (bool, optional): remove Twitter-added "" character at the end of messages that are too long. Defaults to False.
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
remove_punctuation (bool, optional): remove punctuation from the text, be careful, it will remove # of hashtags too. Defaults to False.
emojis_remover (EmojisRemover, optional):
if provided, overrides the default engine used for emojis matching and removal.
Has no effect if `remove_emojis` is set to False.
"""
if s is None:
s = ""
if emojis_remover is None:
emojis_remover = ExplicitUnicodeBlocksEmojisRemover()
assert isinstance(s, (str, list, pd.Series, set, frozenset))
if isinstance(s, str):
@@ -104,7 +114,7 @@
for msg in s
]
if remove_emojis:
s = [demoji.replace(msg, "").strip() for msg in s]
s = [emojis_remover.remove_symbols(msg).strip() for msg in s]
if remove_hashtags_frontend:
if (not remove_urls) or (not remove_mentions):
@@ -145,22 +155,22 @@ def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int =
Returns:
dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for 3 delta method.
"""
assert isinstance(
dataset, (pd.Series, pd.DataFrame)
), "dataset must be a pd.Series or a pd.DataFrame"
assert isinstance(dataset, (pd.Series, pd.DataFrame)), (
"dataset must be a pd.Series or a pd.DataFrame"
)
assert dataset.index.nunique() == len(
dataset
), "dataset must be indexed with unique indices"
assert dataset.index.nunique() == len(dataset), (
"dataset must be indexed with unique indices"
)
assert all(
[isinstance(i, str) for i in dataset.index]
), "dataset indices must be `str`"
assert all([isinstance(i, str) for i in dataset.index]), (
"dataset indices must be `str`"
)
if isinstance(dataset, pd.DataFrame):
assert (
"original" in dataset.columns
), "when dataset is a pd.DataFrame, it must have a column named 'original'"
assert "original" in dataset.columns, (
"when dataset is a pd.DataFrame, it must have a column named 'original'"
)
if isinstance(dataset, pd.Series):
dataset = dataset.to_frame("original")
@@ -221,7 +231,7 @@
if min_size_txt is not None:
print(
f'Removing {(dataset["text_grapheme"].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences...'
f"Removing {(dataset['text_grapheme'].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences..."
)
dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
print("Done.")
@@ -246,9 +256,9 @@
Returns:
dataset (pd.DataFrame): The same input dataset with column 'language' added containing the results of language detection.
"""
assert (
"text_language_detect" in dataset.columns
), "you need to have a column text_language_detect to detect language"
assert "text_language_detect" in dataset.columns, (
"you need to have a column text_language_detect to detect language"
)
if fasttext_model is None:
if os.path.exists("lid.176.ftz"):
@@ -413,9 +423,9 @@
def similarity_levenshtein(pair):
s1, s2 = pair
assert (
min(len(s1), len(s2)) > 0
), "one text_grapheme is None and levenshtein can't be retrieved"
assert min(len(s1), len(s2)) > 0, (
"one text_grapheme is None and levenshtein can't be retrieved"
)
return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))
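The injected `emojis_remover` seam added in this diff can be exercised roughly as follows; a sketch only, since the diff does not show this module's file name (the `from d3lta import preprocess_text` path is an assumption):

```python
from d3lta.emojis_remover import ExplicitUnicodeBlocksEmojisRemover
# NOTE: the import path below is assumed for illustration; the diff does not name this module
from d3lta import preprocess_text

cleaned = preprocess_text(
    "Bonjour 👋 tout le monde https://example.com @user",
    remove_emojis=True,
    # optionally override the default emoji-removal engine
    emojis_remover=ExplicitUnicodeBlocksEmojisRemover(),
)
print(cleaned)
```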

1726
poetry.lock Generated Normal file

File diff suppressed because it is too large.

pyproject.toml

@@ -1,25 +1,32 @@
[tool.poetry]
[project]
name = "d3lta"
version = "1.0.0"
description = "d3lta package"
version = "1.0.2"
description = "A library for detecting verbatim-duplicated contents within a vast amount of documents"
readme = "README.md"
authors = ["Viginum"]
authors = [{ name = "VIGINUM" }]
license = { file = "LICENSE.txt" }
requires-python = ">=3.10"
dependencies = [
"faiss-cpu==1.9.0.post1",
"fasttext==0.9.3",
"gensim==4.3.3",
"networkx==2.8.8",
"pandas==2.2.3",
"polyleven==0.8",
"scipy==1.12.0",
"tensorflow==2.18.0",
"tensorflow-hub==0.16.1",
"tensorflow-text==2.18.1",
"tqdm==4.67.1",
]
[tool.poetry.dependencies]
python = "^3.10"
demoji = "1.1.0"
faiss-cpu = "1.9.0.post1"
fasttext = "0.9.3"
gensim = "4.3.3"
networkx = "2.8.8"
pandas = "2.2.3"
polyleven = "0.8"
scipy = "1.12.0"
tensorflow = "2.18.0"
tensorflow-hub = "0.16.1"
tensorflow-text = "2.18.1"
tqdm = "4.67.1"
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"
[build-system]
requires = ["setuptools","poetry-core"]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

setup.py

@@ -1,4 +0,0 @@
#!/usr/bin/env python
from setuptools import setup
setup()

Binary data
static/graph.gif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 10 MiB

After

Width:  |  Height:  |  Size: 8.5 MiB

105
tests/emojis_remover_test.py Normal file

@@ -0,0 +1,105 @@
import os
from typing import Any, Generator, TypeVar
import pytest
from get_unicode_emojis_list import (
EMOJI_TESTFILE_FILENAME,
get_all_emojis_from_latest_unicode_emojis_specification_with_download,
)
import d3lta.emojis_remover
@pytest.fixture(
name="emojis_remover",
params=[
d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover,
],
)
def fixture_emojis_remover(
request: pytest.FixtureRequest,
) -> d3lta.emojis_remover.EmojisRemover:
return request.param()
T = TypeVar("T")
FixtureWithTeardown = Generator[T, Any, Any]
@pytest.fixture(name="latest_unicode_public_emojis", scope="session")
def fixture_latest_unicode_public_emojis() -> FixtureWithTeardown[list[str]]:
"""Latest list of emojis from the unicode consortium"""
emojis = get_all_emojis_from_latest_unicode_emojis_specification_with_download()
print(f"Retrieved {len(emojis)} unique emojis")
yield emojis
os.remove(EMOJI_TESTFILE_FILENAME)
ACCEPTABLE_ASCII_SYMBOLS = [
"*",
"#",
"©",
"®",
"",
"",
"",
"",
"",
"",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
]
def is_acceptable_ascii_symbol(text: str):
return text in ACCEPTABLE_ASCII_SYMBOLS
def test_removes_all_emojis_in_latest_unicode_emojis_specification(
latest_unicode_public_emojis: list[str],
emojis_remover: d3lta.emojis_remover.EmojisRemover,
):
for i, emoji in enumerate(latest_unicode_public_emojis):
replacement = emojis_remover.remove_symbols(emoji)
assert len(replacement) == 0 or is_acceptable_ascii_symbol(replacement), (
f"Error at index {i}: {emoji} yields {replacement} ({replacement.encode('unicode-escape')})"
)
@pytest.fixture(name="sample_text")
def fixture_sample_text() -> str:
return """
The representatives of the French People, formed into a National Assembly, considering ignorance, forgetfulness or contempt of the rights of man to be the only causes of public misfortunes and the corruption of Governments, have resolved to set forth, in a solemn Declaration, the natural, unalienable and sacred rights of man, to the end that this Declaration, constantly present to all members of the body politic, may remind them unceasingly of their rights and their duties; to the end that the acts of the legislative power and those of the executive power, since they may be continually compared with the aim of every political institution, may thereby be the more respected; to the end that the demands of the citizens, founded henceforth on simple and incontestable principles, may always be directed toward the maintenance of the Constitution and the happiness of all.
In consequence whereof, the National Assembly recognises and declares, in the presence and under the auspices of the Supreme Being, the following Rights of Man and of the Citizen.
""".strip()
@pytest.fixture(name="sample_text_with_emojipasta")
def fixture_sample_text_with_emojipasta() -> str:
return """
The representatives of the French 🥖🥐🍟 People, 🚷 formed 🈸 into a National 🏞 Assembly, 🧑🏭 considering 🤔 ignorance, 🤷🤷 forgetfulness or contempt of the rights 🧎 of man 👳👨🔬👳👨🔬👳👨🔬 to be the only causes 🎗 of public 🚋🚅📢 misfortunes and the corruption of Governments, have 🈶 resolved to set 📐 forth, in a solemn Declaration, the natural, unalienable and sacred 🔥 rights 👉 of man, 👨👩👧👧👨💋👨👩👨🚶👨🦳👨👩👦👦🚣👨🦽👞🧛 to the end 🔚 that this 🙂 Declaration, constantly present 🎁 to all members of the body 🖐👀🤟🦷👁🤚🖕👄👅🤲 politic, may remind them unceasingly of their rights 👩🦽 and their duties; to the end 🔚 that the acts of the legislative power 🔋🔌 and those of the executive power, since they 👩👩👦👦 may be continually compared with the aim of every political institution, may thereby be the more respected; to the end 🔚 that the demands 🫴 of the citizens, founded henceforth on simple and incontestable principles, may always be directed 🎯 toward the maintenance of the Constitution and the happiness of all.
In consequence whereof, the National 🏞 Assembly 👩🏭👨🏭 recognises and declares, in the presence and under 🌁🌁🌁 the auspices of the Supreme Being, 🐝 the following Rights 👨🦼 of Man 👨🔬 and of the Citizen.
""".strip()
def test_on_text_sample(
emojis_remover: d3lta.emojis_remover.EmojisRemover,
sample_text_with_emojipasta: str,
sample_text: str
):
assert emojis_remover.remove_symbols(
sample_text_with_emojipasta,
) == sample_text


@@ -1,6 +1,4 @@
import os
import re
import sys
import pandas as pd
import pytest
@@ -26,13 +24,13 @@
]
def test_compute_language(examples_dataset):
def test_compute_language(examples_dataset: list[str]):
df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df_language = compute_language(df_language)
assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]
def test_embedding_similarity(examples_dataset):
def test_embedding_similarity(examples_dataset: list[str]):
df_test = pd.DataFrame(
examples_dataset,
columns=["text_to_embed"],
@@ -54,7 +52,7 @@ def test_embedding_similarity(examples_dataset):
)
def test_semantic_faiss(examples_dataset):
def test_semantic_faiss(examples_dataset: list[str]):
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df = compute_language(df)
df_emb = compute_embeddings(

43
tests/get_unicode_emojis_list.py Normal file

@@ -0,0 +1,43 @@
# taken from https://gist.github.com/msenol86/44082269be46aa446ccda9d02202e523
import os
import re
import urllib.request
EMOJI_TESTFILE_FILENAME = "emoji-test.txt"
EMOJI_DATA_URL = "https://unicode.org/Public/emoji/latest/emoji-test.txt"
def download_latest_emoji_test_data() -> None:
with urllib.request.urlopen(EMOJI_DATA_URL) as emoji_data_request_response:
emoji_test_file = emoji_data_request_response.read()
with open(EMOJI_TESTFILE_FILENAME, "wb") as tmp_file:
tmp_file.write(emoji_test_file)
def get_all_emojis_from_latest_unicode_emojis_specification_with_download() -> list[
str
]:
if not os.path.exists(EMOJI_TESTFILE_FILENAME):
print(EMOJI_TESTFILE_FILENAME + " file not found. Downloading it ...")
download_latest_emoji_test_data()
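# matches specification rows such as:
# 1F600 ; fully-qualified # 😀 E1.0 grinning face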
emoji_matching_in_unicode_specification_v16_0_pattern = re.compile(
r"(?:minimally|fully)-qualified[ ]*# (?P<emoji>.*?) "
)
with open(EMOJI_TESTFILE_FILENAME, "r", encoding="utf8") as unicode_data:
unicode_data_rows = unicode_data.read()
def _deduplicate(items: list[str]):
return list(set(items))
emojis = _deduplicate(
emoji_matching_in_unicode_specification_v16_0_pattern.findall(unicode_data_rows)
)
return emojis
if __name__ == "__main__":
print(get_all_emojis_from_latest_unicode_emojis_specification_with_download())