Mirror of https://github.com/VIGINUM-FR/D3lta.git
Synced 2025-10-29 05:04:20 +02:00
Merge pull request #3 from VIGINUM-FR/fast-emoji-replace
Increase emoji-removal robustness and performance
This commit is contained in:
Commit 8979129306
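In short: preprocess_text no longer calls demoji.replace directly. Emoji matching and removal now go through a pluggable EmojisRemover abstraction whose default implementation, ExplicitUnicodeBlocksEmojisRemover, applies one compiled regex over explicitly listed Unicode blocks. A minimal sketch of the resulting API, as defined in d3lta/emojis_remover.py below:

    from d3lta.emojis_remover import ExplicitUnicodeBlocksEmojisRemover

    remover = ExplicitUnicodeBlocksEmojisRemover()
    # emojis are stripped and the leftover double spaces collapsed:
    print(remover.remove_symbols("Hello 👋 world 🌍"))  # -> "Hello world"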
.devcontainer/Dockerfile (new file, 30 lines)
@@ -0,0 +1,30 @@
FROM mcr.microsoft.com/devcontainers/python:1-3.11-bookworm AS d3lta-prod

ENV PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    \
    # Poetry
    # https://python-poetry.org/docs/configuration/#using-environment-variables
    POETRY_VERSION=2.1.1 \
    # make poetry install to this location
    POETRY_HOME="/opt/poetry" \
    # do not ask any interactive questions
    POETRY_NO_INTERACTION=1 \
    # never create virtual environments automatically
    POETRY_VIRTUALENVS_CREATE=false

RUN pip install --no-cache-dir --upgrade pip
RUN pipx install poetry==${POETRY_VERSION}

WORKDIR /app
COPY pyproject.toml poetry.lock setup.py README.md ./
# pre-install dependencies
RUN --mount=type=cache,target=/root/.cache poetry install --no-root

COPY notebooks /app/notebooks/
COPY d3lta /app/d3lta/
RUN --mount=type=cache,target=/root/.cache poetry install

FROM d3lta-prod AS d3lta-dev

RUN --mount=type=cache,target=/root/.cache poetry install --with dev
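Note on the two-stage build: d3lta-prod installs only the runtime dependencies, and the final d3lta-dev stage layers the optional dev dependency group on top of it. Since devcontainer.json (next) specifies no build target, the dev container builds through to the last stage, d3lta-dev.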
.devcontainer/devcontainer.json (new file, 9 lines)
@@ -0,0 +1,9 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
    "name": "Python 3",
    "dockerFile": "./Dockerfile",
    "context": "..",
    // Features to add to the dev container. More info: https://containers.dev/features.
    // "features": {}
}
.github/workflows/publish-to-pypi.yml (vendored, 22 lines added)
@@ -4,8 +4,30 @@ name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
 on: push

 jobs:
+  unit-test:
+    name: Run unit tests 🤾
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install dependencies
+        run: |
+          pip install poetry
+          poetry install --with dev
+      - name: Run unit tests
+        run:
+          poetry run pytest
+
   build:
     name: Build distribution 📦
+    needs:
+      - unit-test
     runs-on: ubuntu-latest

     steps:
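The build job now declares needs: unit-test, so with this change distributions are only built (and later published) after the test job has passed on the same push.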
.gitignore (vendored, 3 lines changed)
@@ -227,4 +227,5 @@ pyrightconfig.json
 *.pkl
 *.ftz
 use_model_kaggle/
-include/
+include/
+.benchmarks
d3lta/emojis_remover.py (new file, 103 lines)
@@ -0,0 +1,103 @@
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import final

import demoji


@dataclass
class EmojisRemover(ABC):
    skip_postprocessing: bool = False

    @final
    def remove_symbols(self, text: str) -> str:
        text_without_symbols = self._remove_symbols_implementation(text)
        if self.skip_postprocessing:
            return text_without_symbols

        return self._postprocess(text_without_symbols)

    def _postprocess(self, text: str) -> str:
        # text = self._remove_whitespace_before_newline(text)
        text_without_repeated_whitespace = self._remove_repeated_whitespace(text)
        stripped_text_without_repeated_whitespace = (
            text_without_repeated_whitespace.strip()
        )
        return stripped_text_without_repeated_whitespace

    @abstractmethod
    def _remove_symbols_implementation(self, text: str) -> str: ...

    _whitespace_or_newline_capturing_group_name = "whitespace_or_newline"
    _repeated_whitespace_pattern = re.compile(
        rf"[ ]+(?P<{_whitespace_or_newline_capturing_group_name}> |\n)"
    )

    def _remove_repeated_whitespace(self, text: str) -> str:
        return re.sub(
            self._repeated_whitespace_pattern,
            rf"\g<{self._whitespace_or_newline_capturing_group_name}>",
            text,
        )


class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
    # Unicode ranges for most emojis
    SYMBOLS_REGEX = re.compile(
        "["
        "\U000020d0-\U000020ff"  # Combining Diacritical Marks for Symbols
        "\U00002190-\U000021ff"  # Arrows
        "\U00002300-\U000023ff"  # Miscellaneous Technical
        "\U00002400-\U0000243f"  # Control Pictures
        "\U00002440-\U0000245f"  # Optical Character Recognition
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U00002460-\U0000249f"  # Enclosed Alphanumerics
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U000024b0-\U000024ff"  # Enclosed Alphanumerics Extension
        "\U00002500-\U0000257f"  # Box Drawing
        "\U00002580-\U000025ff"  # Block Elements
        "\U00002600-\U000026ff"  # Miscellaneous Symbols
        "\U00002700-\U000027bf"  # Dingbats
        "\U000027c0-\U000027ef"  # Miscellaneous Mathematical Symbols-A
        "\U000027f0-\U000027ff"  # Supplemental Arrows-A
        "\U00002800-\U000028ff"  # Braille Patterns
        "\U00002900-\U0000297f"  # Supplemental Arrows-B
        "\U00002980-\U000029ff"  # Miscellaneous Mathematical Symbols-B
        "\U00002a00-\U00002aff"  # Supplemental Mathematical Operators
        "\U00002b00-\U00002bff"  # Miscellaneous Symbols and Arrows
        "\U00003000-\U0000303f"  # CJK Symbols and Punctuation
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U00003200-\U000032ff"  # Enclosed CJK Letters and Months
        "\U0001f000-\U0001f02f"  # Mahjong Tiles
        "\U0001f030-\U0001f09f"  # Domino Tiles
        "\U0001f0a0-\U0001f0ff"  # Playing Cards
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U0001f100-\U0001f1ff"  # Enclosed Alphanumeric Supplement
        # WARNING: should we simply be transforming those enclosed characters to their plain, non-enclosed counterpart?
        "\U0001f200-\U0001f2ff"  # Enclosed Ideographic Supplement
        "\U0001f300-\U0001f5ff"  # Miscellaneous Symbols and Pictographs
        "\U0001f600-\U0001f64f"  # Emoticons
        "\U0001f650-\U0001f67f"  # Ornamental Dingbats
        "\U0001f680-\U0001f6ff"  # Transport and Map Symbols
        "\U0001f700-\U0001f77f"  # Alchemical Symbols
        "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
        "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
        "\U0001fa00-\U0001fa6f"  # Chess Symbols
        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
        "\U0001fb00-\U0001fbff"  # Symbols for Legacy Computing
        "\U000e0000-\U000e007f"  # Tags (used for modifying emojis with region modifiers in particular)
        "\U0000200d"  # Zero Width Joiner (ZWJ)
        "\U0000fe0f"  # Variation Selector-16 (emoji style)
        "\U0000fe0e"  # Variation Selector-15 (text style)
        "]+"
    )

    def _remove_symbols_implementation(self, text: str) -> str:
        return self.SYMBOLS_REGEX.sub(r"", text)


class DemojiEmojisRemover(EmojisRemover):
    def _remove_symbols_implementation(self, text: str) -> str:
        return demoji.replace(text)
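A usage sketch for the class hierarchy above; the expected outputs follow from SYMBOLS_REGEX and the whitespace postprocessing (the chess piece and playing card fall in the 2600-26FF and 1F0A0-1F0FF blocks):

    from d3lta.emojis_remover import ExplicitUnicodeBlocksEmojisRemover

    remover = ExplicitUnicodeBlocksEmojisRemover()
    assert remover.remove_symbols("chess ♟️ and cards 🂡") == "chess and cards"

    # skip_postprocessing keeps the whitespace the removed symbols leave behind:
    raw = ExplicitUnicodeBlocksEmojisRemover(skip_postprocessing=True)
    assert raw.remove_symbols("chess ♟️ and cards 🂡") == "chess  and cards "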
d3lta/d3lta.py (changed)
@@ -1,23 +1,26 @@
-from functools import wraps
 import os
 import re
 import time
+from functools import wraps
 from typing import Union
-import demoji

 import faiss
 import fasttext
-from gensim.utils import deaccent
-import networkx as nx
 import numpy as np
 import pandas as pd
-from polyleven import levenshtein
 import requests
 import tensorflow as tf
 import tensorflow_hub as hub
-import tensorflow_text
-from tqdm.contrib.concurrent import thread_map

+# importing `tensorflow_text` ensures that some ops required by the USE model are available at runtime
+import tensorflow_text  # noqa: F401 # pylint: disable=unused-import
+from gensim.utils import deaccent
+from polyleven import levenshtein
+from tqdm.auto import trange
+import networkx as nx
+from tqdm.contrib.concurrent import thread_map
+
+from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover


 def timeit(func):
@@ -31,7 +34,7 @@ def timeit(func):
         if total_time < 60:
             print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
         else:
-            print(f"<<< End {func.__name__}, Took:{np.round((total_time)/60, 1)} min")
+            print(f"<<< End {func.__name__}, Took:{np.round((total_time) / 60, 1)} min")
         return result

     return timeit_wrapper
@@ -49,16 +52,17 @@ def grouper(iterable, n):


 def preprocess_text(
-    s,
-    lower=True,
-    remove_accents=True,
-    remove_urls=True,
-    remove_mentions=True,
-    remove_emojis=True,
-    remove_hashtags_frontend=False,
-    remove_twitter_cropend=False,
-    replace_newline_characters=True,
-    remove_punctuation=False,
+    s: str | list[str] | set[str] | frozenset[str] | pd.Series,
+    lower: bool = True,
+    remove_accents: bool = True,
+    remove_urls: bool = True,
+    remove_mentions: bool = True,
+    remove_emojis: bool = True,
+    remove_hashtags_frontend: bool = False,
+    remove_twitter_cropend: bool = False,
+    replace_newline_characters: bool = True,
+    remove_punctuation: bool = False,
+    emojis_remover: EmojisRemover | None = None,
 ):
     """
     clean a list-like of strings, performing all the following treatments by default
@@ -68,15 +72,21 @@ def preprocess_text(
         remove_accents (bool, optional): deaccent the text. Defaults to True.
         remove_urls (bool, optional): remove urls from the text. Defaults to True.
         remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
-        remove_emojis (bool, optional): remove emojis from the text. Defaults to True.
+        remove_emojis (bool, optional): remove emojis and other pictograms from the text. Defaults to True.
         remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
         remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False.
         replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
         remove_punctuation (bool, optional): remove punctuation from the text, be careful, it will remove # of hashtags too. Defaults to False.
+        emojis_remover (EmojisRemover, optional):
+            if provided, overrides the default engine used for emojis matching and removal.
+            Has no effect if `remove_emojis` is set to False.
     """
     if s is None:
         s = ""

+    if emojis_remover is None:
+        emojis_remover = ExplicitUnicodeBlocksEmojisRemover()
+
     assert isinstance(s, (str, list, pd.Series, set, frozenset))

     if isinstance(s, str):
@@ -104,7 +114,7 @@ def preprocess_text(
             for msg in s
         ]
     if remove_emojis:
-        s = [demoji.replace(msg, "").strip() for msg in s]
+        s = [emojis_remover.remove_symbols(msg).strip() for msg in s]

     if remove_hashtags_frontend:
         if (not remove_urls) or (not remove_mentions):
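With the new emojis_remover parameter, callers can swap the removal engine without touching the rest of the pipeline. A sketch, assuming preprocess_text is imported from the main d3lta.d3lta module as the package layout suggests:

    from d3lta.d3lta import preprocess_text
    from d3lta.emojis_remover import DemojiEmojisRemover

    # default: ExplicitUnicodeBlocksEmojisRemover is instantiated internally
    cleaned = preprocess_text("Breaking 😱 news @user https://example.com")

    # explicitly opting back into the demoji-based engine:
    cleaned_demoji = preprocess_text(
        "Breaking 😱 news @user https://example.com",
        emojis_remover=DemojiEmojisRemover(),
    )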
@@ -145,22 +155,22 @@ def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int =
     Returns:
         dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for 3 delta method.
     """
-    assert isinstance(
-        dataset, (pd.Series, pd.DataFrame)
-    ), "dataset must be a pd.Series or a pd.DataFrame"
+    assert isinstance(dataset, (pd.Series, pd.DataFrame)), (
+        "dataset must be a pd.Series or a pd.DataFrame"
+    )

-    assert dataset.index.nunique() == len(
-        dataset
-    ), "dataset must be indexed with unique indices"
+    assert dataset.index.nunique() == len(dataset), (
+        "dataset must be indexed with unique indices"
+    )

-    assert all(
-        [isinstance(i, str) for i in dataset.index]
-    ), "dataset indices must be `str`"
+    assert all([isinstance(i, str) for i in dataset.index]), (
+        "dataset indices must be `str`"
+    )

     if isinstance(dataset, pd.DataFrame):
-        assert (
-            "original" in dataset.columns
-        ), "when dataset is a pd.DataFrame, it must have a column named 'original'"
+        assert "original" in dataset.columns, (
+            "when dataset is a pd.DataFrame, it must have a column named 'original'"
+        )

     if isinstance(dataset, pd.Series):
         dataset = dataset.to_frame("original")
@@ -221,7 +231,7 @@ def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int =

     if min_size_txt is not None:
         print(
-            f'Removing {(dataset["text_grapheme"].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences...'
+            f"Removing {(dataset['text_grapheme'].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences..."
         )
         dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
         print("Done.")
@@ -246,9 +256,9 @@ def compute_language(
     Returns:
         dataset (pd.DataFrame): The same input dataset with column 'language' added containing the results of language detection.
     """
-    assert (
-        "text_language_detect" in dataset.columns
-    ), "you need to have a column text_language_detect to detect language"
+    assert "text_language_detect" in dataset.columns, (
+        "you need to have a column text_language_detect to detect language"
+    )

     if fasttext_model is None:
         if os.path.exists("lid.176.ftz"):
@@ -413,9 +423,9 @@

 def similarity_levenshtein(pair):
     s1, s2 = pair
-    assert (
-        min(len(s1), len(s2)) > 0
-    ), "one text_grapheme is None and levenshtein can't be retrieved"
+    assert min(len(s1), len(s2)) > 0, (
+        "one text_grapheme is None and levenshtein can't be retrieved"
+    )
     return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))
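For reference, the value returned above is a normalized similarity in [0, 1]: one minus the edit distance divided by the longer string's length. A worked example with polyleven:

    from polyleven import levenshtein

    s1, s2 = "kitten", "sitting"  # classic pair, edit distance 3
    similarity = 1 - levenshtein(s1, s2) / max(len(s1), len(s2))
    print(round(similarity, 3))  # 1 - 3/7 = 0.571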
poetry.lock (generated, new file, 1751 lines)
File diff suppressed because it is too large.
pyproject.toml (changed)
@@ -1,13 +1,13 @@
 [tool.poetry]
 name = "d3lta"
-version = "1.0.0"
+version = "1.0.0.post2"
 description = "A library for detecting verbatim-duplicated contents within a vast amount of documents"
 readme = "README.md"
 authors = ["Viginum"]

 [tool.poetry.dependencies]
 python = "^3.10"
-demoji = "1.1.0"
+demoji = "^1.1.0"
 faiss-cpu = "1.9.0.post1"
 fasttext = "0.9.3"
 gensim = "4.3.3"
@@ -20,6 +20,13 @@ tensorflow-hub = "0.16.1"
 tensorflow-text = "2.18.1"
 tqdm = "4.67.1"

+[tool.poetry.group.dev]
+optional = true
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.5"
+pytest-benchmark = "^5.1.0"
+
 [build-system]
-requires = ["setuptools","poetry-core"]
+requires = ["setuptools", "poetry-core"]
 build-backend = "poetry.core.masonry.api"
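Note: relaxing demoji from the exact pin 1.1.0 to the caret constraint ^1.1.0 lets Poetry resolve any compatible release (>=1.1.0, <2.0.0). The new optional dev group, installed with poetry install --with dev as in the workflow and Dockerfile above, carries the pytest and pytest-benchmark dependencies used by the tests below.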
tests/emojis_remover_test.py (new file, 118 lines)
@@ -0,0 +1,118 @@
import os
from typing import Any, Generator, TypeVar

import pytest
from get_unicode_emojis_list import (
    EMOJI_TESTFILE_FILENAME,
    get_all_emojis_from_latest_unicode_emojis_specification_with_download,
)
from pytest_benchmark.fixture import (
    BenchmarkFixture,
)

import d3lta.emojis_remover


@pytest.fixture(
    name="emojis_remover",
    params=[
        d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover,
        pytest.param(
            d3lta.emojis_remover.DemojiEmojisRemover,
            marks=pytest.mark.xfail(
                reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification",
                strict=True,
            ),
        ),
    ],
)
def fixture_emojis_remover(
    request: pytest.FixtureRequest,
) -> d3lta.emojis_remover.EmojisRemover:
    return request.param()


T = TypeVar("T")
FixtureWithTeardown = Generator[T, Any, Any]


@pytest.fixture(name="latest_unicode_public_emojis", scope="session")
def fixture_latest_unicode_public_emojis() -> FixtureWithTeardown[list[str]]:
    """Latest list of emojis from the unicode consortium"""
    emojis = get_all_emojis_from_latest_unicode_emojis_specification_with_download()

    print(f"Retrieved {len(emojis)} unique emojis")

    yield emojis

    os.remove(EMOJI_TESTFILE_FILENAME)


ACCEPTABLE_ASCII_SYMBOLS = [
    "*",
    "#",
    "©",
    "®",
    "™",
    "‼",
    "⁇",
    "⁈",
    "⁉",
    "ℹ",
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
]


def is_acceptable_ascii_symbol(text: str):
    return text in ACCEPTABLE_ASCII_SYMBOLS


def test_removes_all_emojis_in_latest_unicode_emojis_specification(
    latest_unicode_public_emojis: list[str],
    emojis_remover: d3lta.emojis_remover.EmojisRemover,
):
    for i, emoji in enumerate(latest_unicode_public_emojis):
        replacement = emojis_remover.remove_symbols(emoji)
        assert len(replacement) == 0 or is_acceptable_ascii_symbol(replacement), (
            f"Error at index {i}: {emoji} yields {replacement} ({replacement.encode('unicode-escape')})"
        )


@pytest.fixture(name="sample_text")
def fixture_sample_text() -> str:
    return """
The representatives of the French People, formed into a National Assembly, considering ignorance, forgetfulness or contempt of the rights of man to be the only causes of public misfortunes and the corruption of Governments, have resolved to set forth, in a solemn Declaration, the natural, unalienable and sacred rights of man, to the end that this Declaration, constantly present to all members of the body politic, may remind them unceasingly of their rights and their duties; to the end that the acts of the legislative power and those of the executive power, since they may be continually compared with the aim of every political institution, may thereby be the more respected; to the end that the demands of the citizens, founded henceforth on simple and incontestable principles, may always be directed toward the maintenance of the Constitution and the happiness of all.

In consequence whereof, the National Assembly recognises and declares, in the presence and under the auspices of the Supreme Being, the following Rights of Man and of the Citizen.
""".strip()


@pytest.fixture(name="sample_text_with_emojipasta")
def fixture_sample_text_with_emojipasta() -> str:
    return """
The representatives of the French 🥖🥐🍟 People, 🚷 formed 🈸 into a National 🏞️ Assembly, 🧑🏭 considering 🤔 ignorance, 🤷♀️🤷♂️ forgetfulness or contempt of the rights ↪️🧎➡️ of man 👳👨🔬👳👨🔬👳👨🔬 to be the only causes 🎗️ of public 🚋🚅📢 misfortunes and the corruption of Governments, have 🈶 resolved to set 📐 forth, in a solemn Declaration, the natural, unalienable and sacred ❤️🔥 rights 👉 of man, 👨👩👧👧👨❤️💋👨👩❤️👨🚶♂️➡️👨🦳👨👩👦👦🚣♂️👨🦽➡️👞🧛♂️ to the end 🔚 that this 🙂 Declaration, constantly present 🎁 to all members of the body 🖐️👀🤟🦷👁️🤚🖕👄👅🤲 politic, may remind them unceasingly of their rights 👩🦽➡️ and their duties; to the end 🔚 that the acts of the legislative power 🔋🔌 and those of the executive power, ✊ since they 👩👩👦👦 may be continually compared with the aim of every political institution, may thereby be the more ➕ respected; to the end 🔚 that the demands 🫴 of the citizens, founded henceforth on simple and incontestable principles, may always be directed 🎯 toward the maintenance of the Constitution and the happiness ☺️ of all.

In consequence whereof, the National 🏞️ Assembly 👩🏭👨🏭 recognises and declares, in the presence and under 🌁🌁🌁 the auspices of the Supreme Being, 🐝 the following Rights 👨🦼➡️ of Man 👨🔬 and of the Citizen.
""".strip()


def test_on_text_sample(
    emojis_remover: d3lta.emojis_remover.EmojisRemover,
    sample_text_with_emojipasta: str,
    sample_text: str,
    benchmark: BenchmarkFixture,
):
    processed = benchmark(
        emojis_remover.remove_symbols,
        sample_text_with_emojipasta,
    )
    assert processed == sample_text
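The parametrized fixture above runs every test against both removers, while pytest.param(..., marks=pytest.mark.xfail(strict=True)) records DemojiEmojisRemover's known gaps as expected failures; strict=True makes an unexpected pass fail the suite. A minimal self-contained illustration of that pattern (hypothetical values, not from the repository):

    import pytest

    @pytest.fixture(
        params=["good", pytest.param("known-bad", marks=pytest.mark.xfail(strict=True))]
    )
    def engine(request):
        return request.param

    def test_engine(engine):
        # passes for "good"; reported as XFAIL (not a failure) for "known-bad"
        assert engine == "good"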
@@ -1,6 +1,4 @@
-import os
-import re
 import sys

 import pandas as pd
 import pytest
@@ -26,13 +24,13 @@ def examples_dataset():
     ]


-def test_compute_language(examples_dataset):
+def test_compute_language(examples_dataset: list[str]):
     df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
     df_language = compute_language(df_language)
     assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]


-def test_embedding_similarity(examples_dataset):
+def test_embedding_similarity(examples_dataset: list[str]):
     df_test = pd.DataFrame(
         examples_dataset,
         columns=["text_to_embed"],
@@ -54,7 +52,7 @@ def test_embedding_similarity(examples_dataset):
     )


-def test_semantic_faiss(examples_dataset):
+def test_semantic_faiss(examples_dataset: list[str]):
     df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
     df = compute_language(df)
     df_emb = compute_embeddings(
tests/get_unicode_emojis_list.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# taken from https://gist.github.com/msenol86/44082269be46aa446ccda9d02202e523
import os
import re
import urllib.request

EMOJI_TESTFILE_FILENAME = "emoji-test.txt"
EMOJI_DATA_URL = "https://unicode.org/Public/emoji/latest/emoji-test.txt"


def download_latest_emoji_test_data() -> None:
    with urllib.request.urlopen(EMOJI_DATA_URL) as emoji_data_request_response:
        emoji_test_file = emoji_data_request_response.read()

    with open(EMOJI_TESTFILE_FILENAME, "wb") as tmp_file:
        tmp_file.write(emoji_test_file)


def get_all_emojis_from_latest_unicode_emojis_specification_with_download() -> list[
    str
]:
    if not os.path.exists(EMOJI_TESTFILE_FILENAME):
        print(EMOJI_TESTFILE_FILENAME + " file not found. Downloading it ...")
        download_latest_emoji_test_data()

    emoji_matching_in_unicode_specification_v16_0_pattern = re.compile(
        r"(?:minimally|fully)-qualified[ ]*# (?P<emoji>.*?) "
    )

    with open(EMOJI_TESTFILE_FILENAME, "r", encoding="utf8") as unicode_data:
        unicode_data_rows = unicode_data.read()

    def _deduplicate(items: list[str]):
        return list(set(items))

    emojis = _deduplicate(
        emoji_matching_in_unicode_specification_v16_0_pattern.findall(unicode_data_rows)
    )

    return emojis


if __name__ == "__main__":
    print(get_all_emojis_from_latest_unicode_emojis_specification_with_download())
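To see what the pattern extracts, take one representative emoji-test.txt row (codepoints, then a status field, then "# ", the emoji, a space, and its version and name). The lazy named group captures everything between "# " and the next space, i.e. the emoji itself. A small check, using a simplified row in the current file format (column padding shortened here):

    import re

    pattern = re.compile(r"(?:minimally|fully)-qualified[ ]*# (?P<emoji>.*?) ")
    row = "1F600 ; fully-qualified # 😀 E1.0 grinning face"
    assert pattern.findall(row) == ["😀"]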