Merge pull request #3 from VIGINUM-FR/fast-emoji-replace

Increase emojis removal robustness and performances
Этот коммит содержится в:
Viginum-DataScientist-1 2025-07-22 16:32:49 +02:00 коммит произвёл GitHub
родитель 80b81896e8 3a4ff9dcd7
Коммит 8979129306
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
11 изменённых файлов: 2140 добавлений и 48 удалений

30
.devcontainer/Dockerfile Обычный файл
Просмотреть файл

@ -0,0 +1,30 @@
FROM mcr.microsoft.com/devcontainers/python:1-3.11-bookworm AS d3lta-prod
ENV PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
\
# Poetry
# https://python-poetry.org/docs/configuration/#using-environment-variables
POETRY_VERSION=2.1.1 \
# make poetry install to this location
POETRY_HOME="/opt/poetry" \
# do not ask any interactive questions
POETRY_NO_INTERACTION=1 \
# never create virtual environments automatically
POETRY_VIRTUALENVS_CREATE=false
RUN pip install --no-cache-dir --upgrade pip
RUN pipx install poetry==${POETRY_VERSION}
WORKDIR /app
COPY pyproject.toml poetry.lock setup.py README.md ./
# pre-install dependencies
RUN --mount=type=cache,target=/root/.cache poetry install --no-root
COPY notebooks /app/notebooks/
COPY d3lta /app/d3lta/
RUN --mount=type=cache,target=/root/.cache poetry install
FROM d3lta-prod AS d3lta-dev
RUN --mount=type=cache,target=/root/.cache poetry install --with dev

9
.devcontainer/devcontainer.json Обычный файл
Просмотреть файл

@ -0,0 +1,9 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
"dockerFile": "./Dockerfile",
"context": "..",
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {}
}

22
.github/workflows/publish-to-pypi.yml поставляемый
Просмотреть файл

@ -4,8 +4,30 @@ name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
on: push
jobs:
unit-test:
name: Run unit tests 🤾
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install dependencies
run: |
pip install poetry
poetry install --with dev
- name: Run the unit tests with pytest
run:
poetry run pytest
build:
name: Build distribution 📦
needs:
- unit-test
runs-on: ubuntu-latest
steps:

3
.gitignore поставляемый
Просмотреть файл

@ -227,4 +227,5 @@ pyrightconfig.json
*.pkl
*.ftz
use_model_kaggle/
include/
include/
.benchmarks

103
d3lta/emojis_remover.py Обычный файл
Просмотреть файл

@ -0,0 +1,103 @@
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import final
import demoji
@dataclass
class EmojisRemover(ABC):
    """Abstract base for engines that strip emojis and other symbols from text.

    Concrete engines only provide `_remove_symbols_implementation`; callers go
    through `remove_symbols`, which optionally tidies up the whitespace left
    behind once the symbols are gone.

    Attributes:
        skip_postprocessing: when True, `remove_symbols` returns the engine
            output untouched (no whitespace collapsing, no stripping).
    """

    skip_postprocessing: bool = False

    # A run of spaces followed by one more space or a newline; only that final
    # character (captured in the named group) is kept by the substitution.
    _whitespace_or_newline_capturing_group_name = "whitespace_or_newline"
    _repeated_whitespace_pattern = re.compile(
        rf"[ ]+(?P<{_whitespace_or_newline_capturing_group_name}> |\n)"
    )

    @abstractmethod
    def _remove_symbols_implementation(self, text: str) -> str:
        """Return `text` with the symbols matched by this engine removed."""

    @final
    def remove_symbols(self, text: str) -> str:
        """Remove symbols from `text`, then post-process unless disabled.

        Args:
            text: the string to clean.

        Returns:
            The engine output, post-processed when `skip_postprocessing` is False.
        """
        cleaned = self._remove_symbols_implementation(text)
        return cleaned if self.skip_postprocessing else self._postprocess(cleaned)

    def _postprocess(self, text: str) -> str:
        """Collapse repeated spaces, then strip leading/trailing whitespace."""
        return self._remove_repeated_whitespace(text).strip()

    def _remove_repeated_whitespace(self, text: str) -> str:
        """Replace each match of the repeated-whitespace pattern by its last character."""
        kept_character = rf"\g<{self._whitespace_or_newline_capturing_group_name}>"
        return self._repeated_whitespace_pattern.sub(kept_character, text)
class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
    """Emoji remover based on an explicit list of Unicode blocks.

    Every character belonging to one of the ranges listed in `SYMBOLS_REGEX`
    is deleted, together with the joiner / variation-selector / tag characters
    used to build composite emojis.
    """

    # Unicode ranges for most emojis and pictographic symbols.
    SYMBOLS_REGEX = re.compile(
        "["
        "\U000020d0-\U000020ff"  # Combining Diacritical Marks for Symbols
        "\U00002190-\U000021ff"  # Arrows
        "\U00002300-\U000023ff"  # Miscellaneous Technical
        "\U00002400-\U0000243f"  # Control Pictures
        "\U00002440-\U0000245f"  # Optical Character Recognition
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        # The whole block is covered: the previous split (2460-249f / 24b0-24ff)
        # left U+24A0-U+24AF (parenthesized letters) unmatched.
        "\U00002460-\U000024ff"  # Enclosed Alphanumerics
        "\U00002500-\U0000257f"  # Box Drawing
        "\U00002580-\U0000259f"  # Block Elements
        "\U000025a0-\U000025ff"  # Geometric Shapes
        "\U00002600-\U000026ff"  # Miscellaneous Symbols
        "\U00002700-\U000027bf"  # Dingbats
        "\U000027c0-\U000027ef"  # Miscellaneous Mathematical Symbols-A
        "\U000027f0-\U000027ff"  # Supplemental Arrows-A
        "\U00002800-\U000028ff"  # Braille Patterns
        "\U00002900-\U0000297f"  # Supplemental Arrows-B
        "\U00002980-\U000029ff"  # Miscellaneous Mathematical Symbols-B
        "\U00002a00-\U00002aff"  # Supplemental Mathematical Operators
        "\U00002b00-\U00002bff"  # Miscellaneous Symbols and Arrows
        "\U00003000-\U0000303f"  # CJK Symbols and Punctuation
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U00003200-\U000032ff"  # Enclosed CJK Letters and Months
        "\U0001f000-\U0001f02f"  # Mahjong Tiles
        "\U0001f030-\U0001f09f"  # Domino Tiles
        "\U0001f0a0-\U0001f0ff"  # Playing Cards
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U0001f100-\U0001f1ff"  # Enclosed Alphanumeric Supplement
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U0001f200-\U0001f2ff"  # Enclosed Ideographic Supplement
        "\U0001f300-\U0001f5ff"  # Miscellaneous Symbols and Pictographs
        "\U0001f600-\U0001f64f"  # Emoticons
        "\U0001f650-\U0001f67f"  # Ornamental Dingbats
        "\U0001f680-\U0001f6ff"  # Transport and Map Symbols
        "\U0001f700-\U0001f77f"  # Alchemical Symbols
        "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
        "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
        "\U0001fa00-\U0001fa6f"  # Chess Symbols
        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
        "\U0001fb00-\U0001fbff"  # Symbols for Legacy Computing
        "\U000e0000-\U000e007f"  # Tags (used for modifying emojis with region modifiers in particular)
        "\U0000200d"  # Zero Width Joiner (ZWJ)
        "\U0000fe0f"  # Variation Selector-16 (emoji style)
        "\U0000fe0e"  # Variation Selector-15 (text style)
        "]+"
    )

    def _remove_symbols_implementation(self, text: str) -> str:
        """Delete every run of characters matched by `SYMBOLS_REGEX` from `text`."""
        return self.SYMBOLS_REGEX.sub("", text)
class DemojiEmojisRemover(EmojisRemover):
    """Emoji remover that delegates the matching to the `demoji` package."""

    def _remove_symbols_implementation(self, text: str) -> str:
        """Return the result of `demoji.replace` applied to `text`."""
        # No replacement argument is passed: `demoji.replace` runs with its default.
        text_without_emojis = demoji.replace(text)
        return text_without_emojis

Просмотреть файл

@ -1,23 +1,26 @@
from functools import wraps
import os
import re
import time
from functools import wraps
from typing import Union
import demoji
import faiss
import fasttext
from gensim.utils import deaccent
import networkx as nx
import numpy as np
import pandas as pd
from polyleven import levenshtein
import requests
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from tqdm.contrib.concurrent import thread_map
# import `tensorflow_text` ensures that some ops required by the USE model are available at runtime
import tensorflow_text # noqa: F401 # pylint: disable=unused-import
from gensim.utils import deaccent
from polyleven import levenshtein
from tqdm.auto import trange
import networkx as nx
from tqdm.contrib.concurrent import thread_map
from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover
def timeit(func):
@ -31,7 +34,7 @@ def timeit(func):
if total_time < 60:
print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
else:
print(f"<<< End {func.__name__}, Took:{np.round((total_time)/60, 1)} min")
print(f"<<< End {func.__name__}, Took:{np.round((total_time) / 60, 1)} min")
return result
return timeit_wrapper
@ -49,16 +52,17 @@ def grouper(iterable, n):
def preprocess_text(
s,
lower=True,
remove_accents=True,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=False,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=False,
s: str | list[str] | set[str] | frozenset[str] | pd.Series,
lower: bool = True,
remove_accents: bool = True,
remove_urls: bool = True,
remove_mentions: bool = True,
remove_emojis: bool = True,
remove_hashtags_frontend: bool = False,
remove_twitter_cropend: bool = False,
replace_newline_characters: bool = True,
remove_punctuation: bool = False,
emojis_remover: EmojisRemover | None = None,
):
"""
clean a list-like of strings, performing all the following treatments by default
@ -68,15 +72,21 @@ def preprocess_text(
remove_accents (bool, optional): deaccent the text. Defaults to True.
remove_urls (bool, optional): remove urls from the text. Defaults to True.
remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis and other pictograms from the text. Defaults to True.
remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False.
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
remove_punctuation (bool, optional): remove punctuation from the text, be careful, it will remove # of hashtags too. Defaults to False.
emojis_remover (EmojisRemover, optional):
if provided, overrides the default engine used for emojis matching and removal.
Has no effect if `remove_emojis` is set to False.
"""
if s is None:
s = ""
if emojis_remover is None:
emojis_remover = ExplicitUnicodeBlocksEmojisRemover()
assert isinstance(s, (str, list, pd.Series, set, frozenset))
if isinstance(s, str):
@ -104,7 +114,7 @@ def preprocess_text(
for msg in s
]
if remove_emojis:
s = [demoji.replace(msg, "").strip() for msg in s]
s = [emojis_remover.remove_symbols(msg).strip() for msg in s]
if remove_hashtags_frontend:
if (not remove_urls) or (not remove_mentions):
@ -145,22 +155,22 @@ def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int =
Returns:
dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for 3 delta method.
"""
assert isinstance(
dataset, (pd.Series, pd.DataFrame)
), "dataset must be a pd.Series or a pd.DataFrame"
assert isinstance(dataset, (pd.Series, pd.DataFrame)), (
"dataset must be a pd.Series or a pd.DataFrame"
)
assert dataset.index.nunique() == len(
dataset
), "dataset must be indexed with unique indices"
assert dataset.index.nunique() == len(dataset), (
"dataset must be indexed with unique indices"
)
assert all(
[isinstance(i, str) for i in dataset.index]
), "dataset indices must be `str`"
assert all([isinstance(i, str) for i in dataset.index]), (
"dataset indices must be `str`"
)
if isinstance(dataset, pd.DataFrame):
assert (
"original" in dataset.columns
), "when dataset is a pd.DataFrame, it must have a column named 'original'"
assert "original" in dataset.columns, (
"when dataset is a pd.DataFrame, it must have a column named 'original'"
)
if isinstance(dataset, pd.Series):
dataset = dataset.to_frame("original")
@ -221,7 +231,7 @@ def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int =
if min_size_txt is not None:
print(
f'Removing {(dataset["text_grapheme"].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences...'
f"Removing {(dataset['text_grapheme'].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences..."
)
dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
print("Done.")
@ -246,9 +256,9 @@ def compute_language(
Returns:
dataset (pd.DataFrame): The same input dataset with column 'language' added containing the results of language detection.
"""
assert (
"text_language_detect" in dataset.columns
), "you need to have a column text_language_detect to detect language"
assert "text_language_detect" in dataset.columns, (
"you need to have a column text_language_detect to detect language"
)
if fasttext_model is None:
if os.path.exists("lid.176.ftz"):
@ -413,9 +423,9 @@ def find_matches(
def similarity_levenshtein(pair):
s1, s2 = pair
assert (
min(len(s1), len(s2)) > 0
), "one text_grapheme is None and levenshtein can't be retrieved"
assert min(len(s1), len(s2)) > 0, (
"one text_grapheme is None and levenshtein can't be retrieved"
)
return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))

1751
poetry.lock сгенерированный Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,13 +1,13 @@
[tool.poetry]
name = "d3lta"
version = "1.0.0"
version = "1.0.0.post2"
description = "A library for detecting verbatim-duplicated contents within a vast amount of documents"
readme = "README.md"
authors = ["Viginum"]
[tool.poetry.dependencies]
python = "^3.10"
demoji = "1.1.0"
demoji = "^1.1.0"
faiss-cpu = "1.9.0.post1"
fasttext = "0.9.3"
gensim = "4.3.3"
@ -20,6 +20,13 @@ tensorflow-hub = "0.16.1"
tensorflow-text = "2.18.1"
tqdm = "4.67.1"
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"
pytest-benchmark = "^5.1.0"
[build-system]
requires = ["setuptools","poetry-core"]
requires = ["setuptools", "poetry-core"]
build-backend = "poetry.core.masonry.api"

118
tests/emojis_remover_test.py Обычный файл
Просмотреть файл

@ -0,0 +1,118 @@
import os
from typing import Any, Generator, TypeVar
import pytest
from get_unicode_emojis_list import (
EMOJI_TESTFILE_FILENAME,
get_all_emojis_from_latest_unicode_emojis_specification_with_download,
)
from pytest_benchmark.fixture import (
BenchmarkFixture,
)
import d3lta.emojis_remover
@pytest.fixture(
    name="emojis_remover",
    params=[
        d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover,
        pytest.param(
            d3lta.emojis_remover.DemojiEmojisRemover,
            marks=pytest.mark.xfail(
                strict=True,
                reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification",
            ),
        ),
    ],
)
def fixture_emojis_remover(
    request: pytest.FixtureRequest,
) -> d3lta.emojis_remover.EmojisRemover:
    """Provide one instance of each emoji remover engine under test.

    The fixture is parametrized with the remover classes themselves; the
    `demoji`-backed one is marked as a strict expected failure.
    """
    remover_class = request.param
    return remover_class()
T = TypeVar("T")
FixtureWithTeardown = Generator[T, Any, Any]
@pytest.fixture(name="latest_unicode_public_emojis", scope="session")
def fixture_latest_unicode_public_emojis() -> FixtureWithTeardown[list[str]]:
    """Yield the emojis listed in the latest Unicode emoji test file.

    Session-scoped: the list is fetched once, and the emoji test file is
    deleted during teardown.
    """
    unicode_emojis = (
        get_all_emojis_from_latest_unicode_emojis_specification_with_download()
    )
    emojis_count = len(unicode_emojis)
    print(f"Retrieved {emojis_count} unique emojis")

    yield unicode_emojis

    # Teardown: remove the emoji test file from the working directory.
    os.remove(EMOJI_TESTFILE_FILENAME)
# Residues tolerated after symbol removal: some emojis are built on top of a
# plain character (keycap digits, "#", "*", ...) that is left in place once
# the emoji-specific code points are gone.
# NOTE(review): the six empty-string entries may be characters lost when this
# file was copied or rendered - confirm against the repository.
ACCEPTABLE_ASCII_SYMBOLS = [
    "*", "#",
    "©", "®",
    "", "", "",
    "", "", "",
    "0", "1", "2", "3", "4",
    "5", "6", "7", "8", "9",
]


def is_acceptable_ascii_symbol(text: str):
    """Tell whether `text` is exactly one of the `ACCEPTABLE_ASCII_SYMBOLS` entries."""
    return any(text == symbol for symbol in ACCEPTABLE_ASCII_SYMBOLS)
def test_removes_all_emojis_in_latest_unicode_emojis_specification(
    latest_unicode_public_emojis: list[str],
    emojis_remover: d3lta.emojis_remover.EmojisRemover,
):
    """Each emoji must be removed entirely or reduced to an accepted residue."""
    for index, emoji in enumerate(latest_unicode_public_emojis):
        leftover = emojis_remover.remove_symbols(emoji)
        fully_removed = not leftover
        assert fully_removed or is_acceptable_ascii_symbol(leftover), (
            f"Error at index {index}: {emoji} yields {leftover} ({leftover.encode('unicode-escape')})"
        )
@pytest.fixture(name="sample_text")
def fixture_sample_text() -> str:
return """
The representatives of the French People, formed into a National Assembly, considering ignorance, forgetfulness or contempt of the rights of man to be the only causes of public misfortunes and the corruption of Governments, have resolved to set forth, in a solemn Declaration, the natural, unalienable and sacred rights of man, to the end that this Declaration, constantly present to all members of the body politic, may remind them unceasingly of their rights and their duties; to the end that the acts of the legislative power and those of the executive power, since they may be continually compared with the aim of every political institution, may thereby be the more respected; to the end that the demands of the citizens, founded henceforth on simple and incontestable principles, may always be directed toward the maintenance of the Constitution and the happiness of all.
In consequence whereof, the National Assembly recognises and declares, in the presence and under the auspices of the Supreme Being, the following Rights of Man and of the Citizen.
""".strip()
@pytest.fixture(name="sample_text_with_emojipasta")
def fixture_sample_text_with_emojipasta() -> str:
return """
The representatives of the French 🥖🥐🍟 People, 🚷 formed 🈸 into a National 🏞 Assembly, 🧑🏭 considering 🤔 ignorance, 🤷🤷 forgetfulness or contempt of the rights 🧎 of man 👳👨🔬👳👨🔬👳👨🔬 to be the only causes 🎗 of public 🚋🚅📢 misfortunes and the corruption of Governments, have 🈶 resolved to set 📐 forth, in a solemn Declaration, the natural, unalienable and sacred 🔥 rights 👉 of man, 👨👩👧👧👨💋👨👩👨🚶👨🦳👨👩👦👦🚣👨🦽👞🧛 to the end 🔚 that this 🙂 Declaration, constantly present 🎁 to all members of the body 🖐👀🤟🦷👁🤚🖕👄👅🤲 politic, may remind them unceasingly of their rights 👩🦽 and their duties; to the end 🔚 that the acts of the legislative power 🔋🔌 and those of the executive power, since they 👩👩👦👦 may be continually compared with the aim of every political institution, may thereby be the more respected; to the end 🔚 that the demands 🫴 of the citizens, founded henceforth on simple and incontestable principles, may always be directed 🎯 toward the maintenance of the Constitution and the happiness of all.
In consequence whereof, the National 🏞 Assembly 👩🏭👨🏭 recognises and declares, in the presence and under 🌁🌁🌁 the auspices of the Supreme Being, 🐝 the following Rights 👨🦼 of Man 👨🔬 and of the Citizen.
""".strip()
def test_on_text_sample(
    emojis_remover: d3lta.emojis_remover.EmojisRemover,
    sample_text_with_emojipasta: str,
    sample_text: str,
    benchmark: BenchmarkFixture,
):
    """Benchmark the remover on the emoji-laden sample and compare with the plain text."""
    remove_symbols = emojis_remover.remove_symbols
    # `benchmark` calls the function with the given argument and hands back its result.
    cleaned_text = benchmark(remove_symbols, sample_text_with_emojipasta)
    assert cleaned_text == sample_text

Просмотреть файл

@ -1,6 +1,4 @@
import os
import re
import sys
import pandas as pd
import pytest
@ -26,13 +24,13 @@ def examples_dataset():
]
def test_compute_language(examples_dataset):
def test_compute_language(examples_dataset: list[str]):
df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df_language = compute_language(df_language)
assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]
def test_embedding_similarity(examples_dataset):
def test_embedding_similarity(examples_dataset: list[str]):
df_test = pd.DataFrame(
examples_dataset,
columns=["text_to_embed"],
@ -54,7 +52,7 @@ def test_embedding_similarity(examples_dataset):
)
def test_semantic_faiss(examples_dataset):
def test_semantic_faiss(examples_dataset: list[str]):
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df = compute_language(df)
df_emb = compute_embeddings(

43
tests/get_unicode_emojis_list.py Обычный файл
Просмотреть файл

@ -0,0 +1,43 @@
# taken from https://gist.github.com/msenol86/44082269be46aa446ccda9d02202e523
import os
import re
import urllib.request
EMOJI_TESTFILE_FILENAME = "emoji-test.txt"
EMOJI_DATA_URL = "https://unicode.org/Public/emoji/latest/emoji-test.txt"
def download_latest_emoji_test_data() -> None:
    """Fetch `EMOJI_DATA_URL` and store the raw bytes in `EMOJI_TESTFILE_FILENAME`."""
    with urllib.request.urlopen(EMOJI_DATA_URL) as response:
        payload = response.read()

    # Written in binary mode: the response body is stored exactly as received.
    with open(EMOJI_TESTFILE_FILENAME, "wb") as destination:
        destination.write(payload)
def get_all_emojis_from_latest_unicode_emojis_specification_with_download() -> list[
    str
]:
    """Return the unique emojis listed in the Unicode emoji test file.

    The file is downloaded first when `EMOJI_TESTFILE_FILENAME` does not exist
    in the current working directory. Only rows flagged `fully-qualified` or
    `minimally-qualified` are considered.

    Returns:
        The emojis without duplicates, in the order of their first appearance
        in the file, so that indices are stable from one run to the next.
    """
    if not os.path.exists(EMOJI_TESTFILE_FILENAME):
        print(EMOJI_TESTFILE_FILENAME + " file not found. Downloading it ...")
        download_latest_emoji_test_data()

    # Captures the emoji that follows "# " on a qualified row, up to the next space.
    qualified_emoji_pattern = re.compile(
        r"(?:minimally|fully)-qualified[ ]*# (?P<emoji>.*?) "
    )

    with open(EMOJI_TESTFILE_FILENAME, "r", encoding="utf8") as unicode_data:
        unicode_data_rows = unicode_data.read()

    matched_emojis = qualified_emoji_pattern.findall(unicode_data_rows)
    # `dict.fromkeys` drops duplicates while keeping first-seen order, unlike
    # `set`, whose iteration order for strings varies between interpreter runs.
    return list(dict.fromkeys(matched_emojis))
if __name__ == "__main__":
print(get_all_emojis_from_latest_unicode_emojis_specification_with_download())