Сравнить коммиты

...

14 Коммитов
v1.0.2 ... main

Автор SHA1 Сообщение Дата
Viginum-DataScientist-6
8df5772178 fix(devcontainer): reflect updates to pyproject.toml in #22 2025-07-31 18:50:36 +02:00
Viginum-DataScientist-6
a18992748e refactor: convert [tool.poetry] to [project]
- Use the recommended PEP621 syntax for the pyproject.toml
2025-07-31 13:39:03 +02:00
Viginum-DataScientist-6
045bd4becf chore: remove unnecessary setup.py
- poetry-core is sufficient for handling builds
2025-07-31 13:39:03 +02:00
Viginum-DataScientist-6
3cdea198bb style: trim trailing whitespaces 2025-07-30 16:59:23 +02:00
Viginum-DataScientist-6
1bd593cf43 feat(ci): add guards for local debugging via nektos/act 2025-07-30 16:58:22 +02:00
Viginum-DataScientist-6
b41ab2ce19 chore: ignore debug event file for nektos/act 2025-07-30 16:58:22 +02:00
Viginum-DataScientist-6
1151e21254 feat(devcontainer): install prebuilt nektos/act executable
- Remove the now-unneeded gh CLI devcontainer feature.
2025-07-30 16:56:56 +02:00
Viginum-DataScientist-6
b8fada79c2 chore: remove demoji-based emojis removal
- Also remove demoji related tests and benchmarking code.
    - This speeds up the unit tests suite.
2025-07-30 14:42:04 +02:00
Viginum-DataScientist-6
95a07bd5a3 feat(devcontainer): add github.vscode-github-actions extension 2025-07-30 14:37:20 +02:00
Viginum-DataScientist-6
e0c747f43c fix(ci): run pypa/build via pipx
- Removes warning caused by running pip as root
2025-07-30 14:34:52 +02:00
Viginum-DataScientist-6
588f20cd4a feat(ci): cache poetry dependencies 2025-07-30 14:33:18 +02:00
Viginum-DataScientist-6
a92770562b feat(ci): install poetry via pipx
Fixes the warning related to using the root environment's pip when installing Poetry via `pip install poetry`.
2025-07-30 14:33:18 +02:00
Viginum-DataScientist-6
991ed8141b style: minor formatting and fix workflow name 2025-07-30 14:33:18 +02:00
Viginum-DataScientist-6
f5f71cca37 fix(ci): restrict release workflow to the main branch
- Prevents creating releases from unprotected branches.
2025-07-30 14:29:06 +02:00
10 изменённых файлов: 89 добавлений и 122 удаления

Просмотреть файл

@ -17,9 +17,9 @@ RUN pip install --no-cache-dir --upgrade pip
RUN pipx install poetry==${POETRY_VERSION} RUN pipx install poetry==${POETRY_VERSION}
WORKDIR /app WORKDIR /app
COPY pyproject.toml poetry.lock setup.py README.md ./ COPY pyproject.toml poetry.lock README.md LICENSE.txt ./
# pre-install dependencies # pre-install dependencies
RUN --mount=type=cache,target=/root/.cache poetry install --no-root RUN --mount=type=cache,target=/root/.cache poetry install --no-root
COPY notebooks /app/notebooks/ COPY notebooks /app/notebooks/
COPY d3lta /app/d3lta/ COPY d3lta /app/d3lta/
@ -27,4 +27,8 @@ RUN --mount=type=cache,target=/root/.cache poetry install
FROM d3lta-prod AS d3lta-dev FROM d3lta-prod AS d3lta-dev
RUN --mount=type=cache,target=/root/.cache poetry install --with dev RUN --mount=type=cache,target=/root/.cache poetry install --with dev
# install nektos/act as specified in https://nektosact.com/installation/index.html#bash-script
# the -b flag specifies the target directory (cf. https://github.com/nektos/act/blob/61396d8085a9d812cebf94fa954f5938d48bf2b9/install.sh#L13)
RUN curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -b /usr/bin

Просмотреть файл

@ -4,6 +4,14 @@
"name": "Python 3", "name": "Python 3",
"dockerFile": "./Dockerfile", "dockerFile": "./Dockerfile",
"context": "..", "context": "..",
// Features to add to the dev container. More info: https://containers.dev/features. "customizations": {
// "features": {} "vscode": {
} "extensions": [
"github.vscode-github-actions"
]
}
},
"features": {
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
}
}

24
.github/workflows/publish-to-pypi.yml поставляемый
Просмотреть файл

@ -5,30 +5,28 @@ on:
release: release:
types: [published] types: [published]
env:
ACT: false # env.ACT == true when running inside nektos/act
jobs: jobs:
build: build:
name: Build distribution name: Build distribution
# based on https://stackoverflow.com/a/74318141
if: ${{ github.event.release.target_commitish == 'main'}}
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
persist-credentials: false persist-credentials: false
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: "3.x" python-version: "3.x"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball - name: Build a binary wheel and a source tarball
run: python3 -m build run: pipx run build
- name: Store the distribution packages - name: Store the distribution packages
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
@ -37,7 +35,7 @@ jobs:
path: dist/ path: dist/
if-no-files-found: error if-no-files-found: error
# taken from https://github.com/python-poetry/poetry/blob/b580e8aa4fbce53569420e7b42568dfd9e73519f/.github/workflows/release.yaml # taken from https://github.com/python-poetry/poetry/blob/b580e8aa4fbce53569420e7b42568dfd9e73519f/.github/workflows/release.yaml
upload-built-distribution-to-github-release: upload-built-distribution-to-github-release:
name: Upload (GitHub) name: Upload (GitHub)
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -57,6 +55,8 @@ jobs:
path: dist/ path: dist/
- run: gh release upload "${TAG_NAME}" dist/*.{tar.gz,whl} - run: gh release upload "${TAG_NAME}" dist/*.{tar.gz,whl}
# skip step when debugging locally via nektos/act
if: ${{ !env.ACT }}
env: env:
GH_TOKEN: ${{ github.token }} GH_TOKEN: ${{ github.token }}
TAG_NAME: ${{ github.event.release.tag_name }} TAG_NAME: ${{ github.event.release.tag_name }}
@ -79,6 +79,8 @@ jobs:
path: dist/ path: dist/
- name: Publish distribution to PyPI - name: Publish distribution to PyPI
# skip step when debugging locally via nektos/act
if: ${{ !env.ACT }}
uses: pypa/gh-action-pypi-publish@release/v1 uses: pypa/gh-action-pypi-publish@release/v1
with: with:
print-hash: true print-hash: true

19
.github/workflows/test.yml поставляемый
Просмотреть файл

@ -1,4 +1,4 @@
name: Publish Python distribution to PyPI and TestPyPI name: Run tests
on: push on: push
@ -11,17 +11,18 @@ jobs:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
persist-credentials: false persist-credentials: false
- name: Install poetry
run: pipx install poetry
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: "3.11" python-version: "3.11"
cache: poetry
- name: Install dependencies - name: Install dependencies
run: | run: poetry install --with dev
pip install poetry
poetry install --with dev
- name: Run tests - name: Run tests
run: run: poetry run pytest
poetry run pytest

3
.gitignore поставляемый
Просмотреть файл

@ -228,4 +228,5 @@ pyrightconfig.json
*.ftz *.ftz
use_model_kaggle/ use_model_kaggle/
include/ include/
.benchmarks .benchmarks
.act-event.json

Просмотреть файл

@ -3,8 +3,6 @@ from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from typing import final from typing import final
import demoji
@dataclass @dataclass
class EmojisRemover(ABC): class EmojisRemover(ABC):
@ -96,8 +94,3 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
def _remove_symbols_implementation(self, text: str) -> str: def _remove_symbols_implementation(self, text: str) -> str:
return self.SYMBOLS_REGEX.sub(r"", text) return self.SYMBOLS_REGEX.sub(r"", text)
class DemojiEmojisRemover(EmojisRemover):
def _remove_symbols_implementation(self, text: str) -> str:
return demoji.replace(text)

75
poetry.lock сгенерированный
Просмотреть файл

@ -155,21 +155,6 @@ files = [
] ]
markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
[[package]]
name = "demoji"
version = "1.1.0"
description = "Accurately remove and replace emojis in text strings"
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
{file = "demoji-1.1.0-py3-none-any.whl", hash = "sha256:6d3256c909aea299e97fe984f827a2a060c2a8f8bfcbafa7ec9659967c5df50f"},
{file = "demoji-1.1.0.tar.gz", hash = "sha256:072efaeca725e6f63ab59d83abeb55b178842538ed9256455a82ebbd055ff216"},
]
[package.extras]
ujson = ["ujson"]
[[package]] [[package]]
name = "exceptiongroup" name = "exceptiongroup"
version = "1.3.0" version = "1.3.0"
@ -1060,18 +1045,6 @@ files = [
{file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
] ]
[[package]]
name = "py-cpuinfo"
version = "9.0.0"
description = "Get CPU info with pure Python"
optional = false
python-versions = "*"
groups = ["dev"]
files = [
{file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"},
{file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"},
]
[[package]] [[package]]
name = "pybind11" name = "pybind11"
version = "2.13.6" version = "2.13.6"
@ -1125,27 +1098,6 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras] [package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-benchmark"
version = "5.1.0"
description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer."
optional = false
python-versions = ">=3.9"
groups = ["dev"]
files = [
{file = "pytest-benchmark-5.1.0.tar.gz", hash = "sha256:9ea661cdc292e8231f7cd4c10b0319e56a2118e2c09d9f50e1b3d150d2aca105"},
{file = "pytest_benchmark-5.1.0-py3-none-any.whl", hash = "sha256:922de2dfa3033c227c96da942d1878191afa135a29485fb942e85dff1c592c89"},
]
[package.dependencies]
py-cpuinfo = "*"
pytest = ">=8.1"
[package.extras]
aspect = ["aspectlib"]
elasticsearch = ["elasticsearch"]
histogram = ["pygal", "pygaljs", "setuptools"]
[[package]] [[package]]
name = "python-dateutil" name = "python-dateutil"
version = "2.9.0.post0" version = "2.9.0.post0"
@ -1291,6 +1243,28 @@ files = [
{file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
] ]
[[package]]
name = "smart-open"
version = "5.1.0"
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
optional = false
python-versions = ">=3.6.*"
groups = ["main"]
markers = "python_version >= \"3.12\""
files = [
{file = "smart_open-5.1.0-py3-none-any.whl", hash = "sha256:2059b07f530c8c9e2158e4e1575309aacb74bd813da2325c1f348015d04f3bd6"},
{file = "smart_open-5.1.0.tar.gz", hash = "sha256:e4dc1350b240ef0759e343e4e2f361bfd4e5477bb2619866e97f80240652e92e"},
]
[package.extras]
all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "requests"]
azure = ["azure-common", "azure-core", "azure-storage-blob"]
gcs = ["google-cloud-storage"]
http = ["requests"]
s3 = ["boto3"]
test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "moto[server] (==1.3.14)", "parameterizedtestcase", "paramiko", "pathlib2", "pytest", "pytest-rerunfailures", "requests", "responses"]
webhdfs = ["requests"]
[[package]] [[package]]
name = "smart-open" name = "smart-open"
version = "7.1.0" version = "7.1.0"
@ -1298,6 +1272,7 @@ description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storag
optional = false optional = false
python-versions = "<4.0,>=3.7" python-versions = "<4.0,>=3.7"
groups = ["main"] groups = ["main"]
markers = "python_version <= \"3.11\""
files = [ files = [
{file = "smart_open-7.1.0-py3-none-any.whl", hash = "sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"}, {file = "smart_open-7.1.0-py3-none-any.whl", hash = "sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"},
{file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"}, {file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"},
@ -1747,5 +1722,5 @@ files = [
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = "^3.10" python-versions = ">=3.10"
content-hash = "2a469cf6cd729d58a4315152a037a242fdc09dba63fe3adfe00bbb88c3f16863" content-hash = "3315c4aedc40f50a78569149ca711d514b9a06b30c9c3b5a6f1402e5abf7e032"

Просмотреть файл

@ -1,32 +1,32 @@
[tool.poetry] [project]
name = "d3lta" name = "d3lta"
version = "1.0.2" version = "1.0.2"
description = "A library for detecting verbatim-duplicated contents within a vast amount of documents" description = "A library for detecting verbatim-duplicated contents within a vast amount of documents"
readme = "README.md" readme = "README.md"
authors = ["Viginum"] authors = [{ name = "VIGINUM" }]
license = { file = "LICENSE.txt" }
requires-python = ">=3.10"
dependencies = [
"faiss-cpu==1.9.0.post1",
"fasttext==0.9.3",
"gensim==4.3.3",
"networkx==2.8.8",
"pandas==2.2.3",
"polyleven==0.8",
"scipy==1.12.0",
"tensorflow==2.18.0",
"tensorflow-hub==0.16.1",
"tensorflow-text==2.18.1",
"tqdm==4.67.1",
]
[tool.poetry.dependencies]
python = "^3.10"
demoji = "^1.1.0"
faiss-cpu = "1.9.0.post1"
fasttext = "0.9.3"
gensim = "4.3.3"
networkx = "2.8.8"
pandas = "2.2.3"
polyleven = "0.8"
scipy = "1.12.0"
tensorflow = "2.18.0"
tensorflow-hub = "0.16.1"
tensorflow-text = "2.18.1"
tqdm = "4.67.1"
[tool.poetry.group.dev] [tool.poetry.group.dev]
optional = true optional = true
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^8.3.5" pytest = "^8.3.5"
pytest-benchmark = "^5.1.0"
[build-system] [build-system]
requires = ["setuptools", "poetry-core"] requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"

Просмотреть файл

@ -1,4 +0,0 @@
#!/usr/bin/env python
from setuptools import setup
setup()

Просмотреть файл

@ -6,9 +6,6 @@ from get_unicode_emojis_list import (
EMOJI_TESTFILE_FILENAME, EMOJI_TESTFILE_FILENAME,
get_all_emojis_from_latest_unicode_emojis_specification_with_download, get_all_emojis_from_latest_unicode_emojis_specification_with_download,
) )
from pytest_benchmark.fixture import (
BenchmarkFixture,
)
import d3lta.emojis_remover import d3lta.emojis_remover
@ -17,13 +14,6 @@ import d3lta.emojis_remover
name="emojis_remover", name="emojis_remover",
params=[ params=[
d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover, d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover,
pytest.param(
d3lta.emojis_remover.DemojiEmojisRemover,
marks=pytest.mark.xfail(
reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification",
strict=True,
),
),
], ],
) )
def fixture_emojis_remover( def fixture_emojis_remover(
@ -108,11 +98,8 @@ In consequence whereof, the National 🏞️ Assembly 👩‍🏭👨‍🏭 r
def test_on_text_sample( def test_on_text_sample(
emojis_remover: d3lta.emojis_remover.EmojisRemover, emojis_remover: d3lta.emojis_remover.EmojisRemover,
sample_text_with_emojipasta: str, sample_text_with_emojipasta: str,
sample_text: str, sample_text: str
benchmark: BenchmarkFixture,
): ):
processed = benchmark( assert emojis_remover.remove_symbols(
emojis_remover.remove_symbols,
sample_text_with_emojipasta, sample_text_with_emojipasta,
) ) == sample_text
assert processed == sample_text