chore: remove demoji-based emoji removal

- Also remove demoji-related tests and benchmarking code.
    - This speeds up the unit test suite.
Этот коммит содержится в:
Viginum-DataScientist-6 2025-07-30 09:25:32 +00:00 коммит произвёл Viginum-DataScientist-1
родитель 95a07bd5a3
Коммит b8fada79c2
4 изменённых файлов: 4 добавлений и 74 удалений

Просмотреть файл

@ -3,8 +3,6 @@ from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import final
import demoji
@dataclass
class EmojisRemover(ABC):
@ -96,8 +94,3 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
def _remove_symbols_implementation(self, text: str) -> str:
return self.SYMBOLS_REGEX.sub(r"", text)
class DemojiEmojisRemover(EmojisRemover):
def _remove_symbols_implementation(self, text: str) -> str:
return demoji.replace(text)

50
poetry.lock сгенерированный
Просмотреть файл

@ -155,21 +155,6 @@ files = [
]
markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
[[package]]
name = "demoji"
version = "1.1.0"
description = "Accurately remove and replace emojis in text strings"
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
{file = "demoji-1.1.0-py3-none-any.whl", hash = "sha256:6d3256c909aea299e97fe984f827a2a060c2a8f8bfcbafa7ec9659967c5df50f"},
{file = "demoji-1.1.0.tar.gz", hash = "sha256:072efaeca725e6f63ab59d83abeb55b178842538ed9256455a82ebbd055ff216"},
]
[package.extras]
ujson = ["ujson"]
[[package]]
name = "exceptiongroup"
version = "1.3.0"
@ -1060,18 +1045,6 @@ files = [
{file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
]
[[package]]
name = "py-cpuinfo"
version = "9.0.0"
description = "Get CPU info with pure Python"
optional = false
python-versions = "*"
groups = ["dev"]
files = [
{file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"},
{file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"},
]
[[package]]
name = "pybind11"
version = "2.13.6"
@ -1125,27 +1098,6 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-benchmark"
version = "5.1.0"
description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer."
optional = false
python-versions = ">=3.9"
groups = ["dev"]
files = [
{file = "pytest-benchmark-5.1.0.tar.gz", hash = "sha256:9ea661cdc292e8231f7cd4c10b0319e56a2118e2c09d9f50e1b3d150d2aca105"},
{file = "pytest_benchmark-5.1.0-py3-none-any.whl", hash = "sha256:922de2dfa3033c227c96da942d1878191afa135a29485fb942e85dff1c592c89"},
]
[package.dependencies]
py-cpuinfo = "*"
pytest = ">=8.1"
[package.extras]
aspect = ["aspectlib"]
elasticsearch = ["elasticsearch"]
histogram = ["pygal", "pygaljs", "setuptools"]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@ -1748,4 +1700,4 @@ files = [
[metadata]
lock-version = "2.1"
python-versions = "^3.10"
content-hash = "2a469cf6cd729d58a4315152a037a242fdc09dba63fe3adfe00bbb88c3f16863"
content-hash = "63a5c842aafa7166bcfbdd716b0d51a14f2df0827ad594e0f8d8bb3d74e7df54"

Просмотреть файл

@ -7,7 +7,6 @@ authors = ["Viginum"]
[tool.poetry.dependencies]
python = "^3.10"
demoji = "^1.1.0"
faiss-cpu = "1.9.0.post1"
fasttext = "0.9.3"
gensim = "4.3.3"
@ -25,7 +24,6 @@ optional = true
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"
pytest-benchmark = "^5.1.0"
[build-system]
requires = ["setuptools", "poetry-core"]

Просмотреть файл

@ -6,9 +6,6 @@ from get_unicode_emojis_list import (
EMOJI_TESTFILE_FILENAME,
get_all_emojis_from_latest_unicode_emojis_specification_with_download,
)
from pytest_benchmark.fixture import (
BenchmarkFixture,
)
import d3lta.emojis_remover
@ -17,13 +14,6 @@ import d3lta.emojis_remover
name="emojis_remover",
params=[
d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover,
pytest.param(
d3lta.emojis_remover.DemojiEmojisRemover,
marks=pytest.mark.xfail(
reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification",
strict=True,
),
),
],
)
def fixture_emojis_remover(
@ -108,11 +98,8 @@ In consequence whereof, the National 🏞️ Assembly 👩‍🏭👨‍🏭 r
def test_on_text_sample(
emojis_remover: d3lta.emojis_remover.EmojisRemover,
sample_text_with_emojipasta: str,
sample_text: str,
benchmark: BenchmarkFixture,
sample_text: str
):
processed = benchmark(
emojis_remover.remove_symbols,
assert emojis_remover.remove_symbols(
sample_text_with_emojipasta,
)
assert processed == sample_text
) == sample_text