зеркало из
https://github.com/VIGINUM-FR/D3lta.git
synced 2025-10-29 21:16:20 +02:00
Сравнить коммиты
26 Коммитов
| Автор | SHA1 | Дата | |
|---|---|---|---|
|
|
8df5772178 | ||
|
|
a18992748e | ||
|
|
045bd4becf | ||
|
|
3cdea198bb | ||
|
|
1bd593cf43 | ||
|
|
b41ab2ce19 | ||
|
|
1151e21254 | ||
|
|
b8fada79c2 | ||
|
|
95a07bd5a3 | ||
|
|
e0c747f43c | ||
|
|
588f20cd4a | ||
|
|
a92770562b | ||
|
|
991ed8141b | ||
|
|
f5f71cca37 | ||
|
|
b1d2b93c24 | ||
|
|
71a76b0d3a | ||
|
|
ed3f0b9db3 | ||
|
|
8999d23448 | ||
|
|
427a873568 | ||
|
|
0386589b46 | ||
|
|
c589aebc41 | ||
|
|
56a1f07c1e | ||
|
|
fb7531405c | ||
|
|
80f12d6ee9 | ||
|
|
eb1599ee10 | ||
|
|
c7107aae1d |
@ -17,9 +17,9 @@ RUN pip install --no-cache-dir --upgrade pip
|
||||
RUN pipx install poetry==${POETRY_VERSION}
|
||||
|
||||
WORKDIR /app
|
||||
COPY pyproject.toml poetry.lock setup.py README.md ./
|
||||
COPY pyproject.toml poetry.lock README.md LICENSE.txt ./
|
||||
# pre-install dependencies
|
||||
RUN --mount=type=cache,target=/root/.cache poetry install --no-root
|
||||
RUN --mount=type=cache,target=/root/.cache poetry install --no-root
|
||||
|
||||
COPY notebooks /app/notebooks/
|
||||
COPY d3lta /app/d3lta/
|
||||
@ -27,4 +27,8 @@ RUN --mount=type=cache,target=/root/.cache poetry install
|
||||
|
||||
FROM d3lta-prod AS d3lta-dev
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache poetry install --with dev
|
||||
RUN --mount=type=cache,target=/root/.cache poetry install --with dev
|
||||
|
||||
# install nektos/act as specified in https://nektosact.com/installation/index.html#bash-script
|
||||
# the -b flag specifies the target directory (cf. https://github.com/nektos/act/blob/61396d8085a9d812cebf94fa954f5938d48bf2b9/install.sh#L13)
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -b /usr/bin
|
||||
|
||||
@ -4,6 +4,14 @@
|
||||
"name": "Python 3",
|
||||
"dockerFile": "./Dockerfile",
|
||||
"context": "..",
|
||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||
// "features": {}
|
||||
}
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"extensions": [
|
||||
"github.vscode-github-actions"
|
||||
]
|
||||
}
|
||||
},
|
||||
"features": {
|
||||
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
|
||||
}
|
||||
}
|
||||
|
||||
166
.github/workflows/publish-to-pypi.yml
поставляемый
166
.github/workflows/publish-to-pypi.yml
поставляемый
@ -1,63 +1,69 @@
|
||||
# derived from https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow
|
||||
name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
|
||||
name: Publish Python distribution to PyPI
|
||||
|
||||
on: push
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
env:
|
||||
ACT: false # env.ACT == true when running inside nektos/act
|
||||
|
||||
jobs:
|
||||
unit-test:
|
||||
name: Run unit tests 🤾
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install poetry
|
||||
poetry install --with dev
|
||||
- name: Build a binary wheel and a source tarball
|
||||
run:
|
||||
poetry run pytest
|
||||
|
||||
build:
|
||||
name: Build distribution 📦
|
||||
needs:
|
||||
- unit-test
|
||||
name: Build distribution
|
||||
# based on https://stackoverflow.com/a/74318141
|
||||
if: ${{ github.event.release.target_commitish == 'main'}}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.x"
|
||||
- name: Install pypa/build
|
||||
run: >-
|
||||
python3 -m
|
||||
pip install
|
||||
build
|
||||
--user
|
||||
|
||||
- name: Build a binary wheel and a source tarball
|
||||
run: python3 -m build
|
||||
run: pipx run build
|
||||
|
||||
- name: Store the distribution packages
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
name: distfiles
|
||||
path: dist/
|
||||
if-no-files-found: error
|
||||
|
||||
# taken from https://github.com/python-poetry/poetry/blob/b580e8aa4fbce53569420e7b42568dfd9e73519f/.github/workflows/release.yaml
|
||||
upload-built-distribution-to-github-release:
|
||||
name: Upload (GitHub)
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
needs: build
|
||||
steps:
|
||||
# Checking-out the project since the gh CLI expects to be called in the context of a git repository.
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Retrieve built distribution
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: distfiles
|
||||
path: dist/
|
||||
|
||||
- run: gh release upload "${TAG_NAME}" dist/*.{tar.gz,whl}
|
||||
# skip step when debugging locally via nektos/act
|
||||
if: ${{ !env.ACT }}
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
TAG_NAME: ${{ github.event.release.tag_name }}
|
||||
|
||||
publish-to-pypi:
|
||||
name: >-
|
||||
Publish Python 🐍 distribution 📦 to PyPI
|
||||
if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
|
||||
needs:
|
||||
- build
|
||||
name: Publish Python distribution to PyPI
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
@ -66,77 +72,15 @@ jobs:
|
||||
id-token: write # IMPORTANT: mandatory for trusted publishing
|
||||
|
||||
steps:
|
||||
- name: Download all the dists
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
- name: Publish distribution 📦 to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
- name: Retrieve built distribution
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: distfiles
|
||||
path: dist/
|
||||
|
||||
github-release:
|
||||
name: >-
|
||||
Sign the Python 🐍 distribution 📦 with Sigstore
|
||||
and upload them to GitHub Release
|
||||
needs:
|
||||
- publish-to-pypi
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # IMPORTANT: mandatory for making GitHub Releases
|
||||
id-token: write # IMPORTANT: mandatory for sigstore
|
||||
|
||||
steps:
|
||||
- name: Download all the dists
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
- name: Sign the dists with Sigstore
|
||||
uses: sigstore/gh-action-sigstore-python@v3.0.0
|
||||
with:
|
||||
inputs: >-
|
||||
./dist/*.tar.gz
|
||||
./dist/*.whl
|
||||
- name: Create GitHub Release
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
run: >-
|
||||
gh release create
|
||||
"$GITHUB_REF_NAME"
|
||||
--repo "$GITHUB_REPOSITORY"
|
||||
--notes ""
|
||||
- name: Upload artifact signatures to GitHub Release
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
# Upload to GitHub Release using the `gh` CLI.
|
||||
# `dist/` contains the built packages, and the
|
||||
# sigstore-produced signatures and certificates.
|
||||
run: >-
|
||||
gh release upload
|
||||
"$GITHUB_REF_NAME" dist/**
|
||||
--repo "$GITHUB_REPOSITORY"
|
||||
|
||||
publish-to-testpypi:
|
||||
name: Publish Python 🐍 distribution 📦 to TestPyPI
|
||||
needs:
|
||||
- build
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://test.pypi.org/p/d3lta # pypi is case insensitive so d3lta == D3lta
|
||||
|
||||
permissions:
|
||||
id-token: write # IMPORTANT: mandatory for trusted publishing
|
||||
|
||||
steps:
|
||||
- name: Download all the dists
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
- name: Publish distribution 📦 to TestPyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
repository-url: https://test.pypi.org/legacy/
|
||||
- name: Publish distribution to PyPI
|
||||
# skip step when debugging locally via nektos/act
|
||||
if: ${{ !env.ACT }}
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
print-hash: true
|
||||
|
||||
28
.github/workflows/test.yml
поставляемый
Обычный файл
28
.github/workflows/test.yml
поставляемый
Обычный файл
@ -0,0 +1,28 @@
|
||||
name: Run tests
|
||||
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
unit-test:
|
||||
name: Run unit tests
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Install poetry
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: poetry
|
||||
|
||||
- name: Install dependencies
|
||||
run: poetry install --with dev
|
||||
|
||||
- name: Run tests
|
||||
run: poetry run pytest
|
||||
3
.gitignore
поставляемый
3
.gitignore
поставляемый
@ -228,4 +228,5 @@ pyrightconfig.json
|
||||
*.ftz
|
||||
use_model_kaggle/
|
||||
include/
|
||||
.benchmarks
|
||||
.benchmarks
|
||||
.act-event.json
|
||||
62
README.md
62
README.md
@ -1,43 +1,22 @@
|
||||
<h2 align="center"> <a href="https://arxiv.org/abs/2312.17338">D3lta</a></h2>
|
||||
# D3lta
|
||||
|
||||
<h5 align="center">
|
||||
|
||||
If you like our project, please give us a star ⭐ on GitHub for the latest update. </h2>
|
||||
|
||||
</h5>
|
||||
|
||||
<div align=center>
|
||||
|
||||
[](https://arxiv.org/abs/2312.17338)
|
||||
[](https://pypi.org/project/d3lta/)
|
||||
[](https://arxiv.org/abs/2312.17338)
|
||||
|
||||
This repository is the official implementation of D3lta, a library for detecting duplicate verbatim contents within a vast amount of documents.
|
||||
|
||||
It distinguishes 3 types of duplicate contents : copypasta (almost exact duplicates), rewording and translation. You can run it on CPU.
|
||||
</div>
|
||||
|
||||
---
|
||||
|
||||
<img style="display: block; margin: auto;" src="https://github.com/VIGINUM-FR/D3lta/raw/main/static/graph.gif"/>
|
||||
|
||||
|
||||
## 💻 Installing
|
||||
|
||||
Clone the repository
|
||||
## 💻 Installation
|
||||
|
||||
```bash
|
||||
git clone https://github.com/VIGINUM-FR/D3lta
|
||||
```
|
||||
|
||||
Navigate to the project
|
||||
|
||||
```bash
|
||||
cd D3lta
|
||||
```
|
||||
|
||||
Install the package
|
||||
|
||||
```bash
|
||||
pip install -e .
|
||||
# PyPI is case insensitive, so d3lta == D3lta
|
||||
pip install d3lta
|
||||
```
|
||||
|
||||
## 🚀 Quick start
|
||||
@ -163,11 +142,10 @@ matches, df_clusters = semantic_faiss(
|
||||
matches
|
||||
```
|
||||
|
||||
|
||||
|
||||
## 📚 Synthetic dataset
|
||||
|
||||
The dataset is available in the release `1.0.0`. It contains the following files:
|
||||
The dataset is available in the [`1.0.0` release](https://github.com/VIGINUM-FR/D3lta/releases/tag/1.0.0).
|
||||
It contains the following files:
|
||||
|
||||
### `synthetic_dataset_documents.csv`:
|
||||
|
||||
@ -205,10 +183,30 @@ Column details:
|
||||
|
||||
## Notebooks
|
||||
|
||||
In folder the [`notebooks`](./notebooks/), you can find:
|
||||
- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): Example of applying threedelta methodology to the synthetic dataset, with a comparison to the true labels.
|
||||
In the [`notebooks`](./notebooks/) directory, you can find:
|
||||
- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): example of applying the D3lta methodology to the synthetic dataset, with a comparison to the true labels.
|
||||
|
||||
|
||||
## 👩💻 Developing
|
||||
|
||||
Clone the repository
|
||||
|
||||
```bash
|
||||
git clone https://github.com/VIGINUM-FR/D3lta
|
||||
```
|
||||
|
||||
Navigate to the project
|
||||
|
||||
```bash
|
||||
cd D3lta
|
||||
```
|
||||
|
||||
Install the package
|
||||
|
||||
```bash
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
## Citation
|
||||
|
||||
If you find our paper and code useful in your research, please consider giving a star 🌟 and a citation 📝:
|
||||
|
||||
@ -3,8 +3,6 @@ from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import final
|
||||
|
||||
import demoji
|
||||
|
||||
|
||||
@dataclass
|
||||
class EmojisRemover(ABC):
|
||||
@ -96,8 +94,3 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
|
||||
|
||||
def _remove_symbols_implementation(self, text: str) -> str:
|
||||
return self.SYMBOLS_REGEX.sub(r"", text)
|
||||
|
||||
|
||||
class DemojiEmojisRemover(EmojisRemover):
|
||||
def _remove_symbols_implementation(self, text: str) -> str:
|
||||
return demoji.replace(text)
|
||||
|
||||
75
poetry.lock
сгенерированный
75
poetry.lock
сгенерированный
@ -155,21 +155,6 @@ files = [
|
||||
]
|
||||
markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
|
||||
|
||||
[[package]]
|
||||
name = "demoji"
|
||||
version = "1.1.0"
|
||||
description = "Accurately remove and replace emojis in text strings"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "demoji-1.1.0-py3-none-any.whl", hash = "sha256:6d3256c909aea299e97fe984f827a2a060c2a8f8bfcbafa7ec9659967c5df50f"},
|
||||
{file = "demoji-1.1.0.tar.gz", hash = "sha256:072efaeca725e6f63ab59d83abeb55b178842538ed9256455a82ebbd055ff216"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
ujson = ["ujson"]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
version = "1.3.0"
|
||||
@ -1060,18 +1045,6 @@ files = [
|
||||
{file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "py-cpuinfo"
|
||||
version = "9.0.0"
|
||||
description = "Get CPU info with pure Python"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"},
|
||||
{file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pybind11"
|
||||
version = "2.13.6"
|
||||
@ -1125,27 +1098,6 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
|
||||
[package.extras]
|
||||
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-benchmark"
|
||||
version = "5.1.0"
|
||||
description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "pytest-benchmark-5.1.0.tar.gz", hash = "sha256:9ea661cdc292e8231f7cd4c10b0319e56a2118e2c09d9f50e1b3d150d2aca105"},
|
||||
{file = "pytest_benchmark-5.1.0-py3-none-any.whl", hash = "sha256:922de2dfa3033c227c96da942d1878191afa135a29485fb942e85dff1c592c89"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
py-cpuinfo = "*"
|
||||
pytest = ">=8.1"
|
||||
|
||||
[package.extras]
|
||||
aspect = ["aspectlib"]
|
||||
elasticsearch = ["elasticsearch"]
|
||||
histogram = ["pygal", "pygaljs", "setuptools"]
|
||||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.9.0.post0"
|
||||
@ -1291,6 +1243,28 @@ files = [
|
||||
{file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smart-open"
|
||||
version = "5.1.0"
|
||||
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
|
||||
optional = false
|
||||
python-versions = ">=3.6.*"
|
||||
groups = ["main"]
|
||||
markers = "python_version >= \"3.12\""
|
||||
files = [
|
||||
{file = "smart_open-5.1.0-py3-none-any.whl", hash = "sha256:2059b07f530c8c9e2158e4e1575309aacb74bd813da2325c1f348015d04f3bd6"},
|
||||
{file = "smart_open-5.1.0.tar.gz", hash = "sha256:e4dc1350b240ef0759e343e4e2f361bfd4e5477bb2619866e97f80240652e92e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "requests"]
|
||||
azure = ["azure-common", "azure-core", "azure-storage-blob"]
|
||||
gcs = ["google-cloud-storage"]
|
||||
http = ["requests"]
|
||||
s3 = ["boto3"]
|
||||
test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "moto[server] (==1.3.14)", "parameterizedtestcase", "paramiko", "pathlib2", "pytest", "pytest-rerunfailures", "requests", "responses"]
|
||||
webhdfs = ["requests"]
|
||||
|
||||
[[package]]
|
||||
name = "smart-open"
|
||||
version = "7.1.0"
|
||||
@ -1298,6 +1272,7 @@ description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storag
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.7"
|
||||
groups = ["main"]
|
||||
markers = "python_version <= \"3.11\""
|
||||
files = [
|
||||
{file = "smart_open-7.1.0-py3-none-any.whl", hash = "sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"},
|
||||
{file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"},
|
||||
@ -1747,5 +1722,5 @@ files = [
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "2a469cf6cd729d58a4315152a037a242fdc09dba63fe3adfe00bbb88c3f16863"
|
||||
python-versions = ">=3.10"
|
||||
content-hash = "3315c4aedc40f50a78569149ca711d514b9a06b30c9c3b5a6f1402e5abf7e032"
|
||||
|
||||
@ -1,32 +1,32 @@
|
||||
[tool.poetry]
|
||||
[project]
|
||||
name = "d3lta"
|
||||
version = "1.0.1"
|
||||
version = "1.0.2"
|
||||
description = "A library for detecting verbatim-duplicated contents within a vast amount of documents"
|
||||
readme = "README.md"
|
||||
authors = ["Viginum"]
|
||||
authors = [{ name = "VIGINUM" }]
|
||||
license = { file = "LICENSE.txt" }
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"faiss-cpu==1.9.0.post1",
|
||||
"fasttext==0.9.3",
|
||||
"gensim==4.3.3",
|
||||
"networkx==2.8.8",
|
||||
"pandas==2.2.3",
|
||||
"polyleven==0.8",
|
||||
"scipy==1.12.0",
|
||||
"tensorflow==2.18.0",
|
||||
"tensorflow-hub==0.16.1",
|
||||
"tensorflow-text==2.18.1",
|
||||
"tqdm==4.67.1",
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
demoji = "^1.1.0"
|
||||
faiss-cpu = "1.9.0.post1"
|
||||
fasttext = "0.9.3"
|
||||
gensim = "4.3.3"
|
||||
networkx = "2.8.8"
|
||||
pandas = "2.2.3"
|
||||
polyleven = "0.8"
|
||||
scipy = "1.12.0"
|
||||
tensorflow = "2.18.0"
|
||||
tensorflow-hub = "0.16.1"
|
||||
tensorflow-text = "2.18.1"
|
||||
tqdm = "4.67.1"
|
||||
|
||||
[tool.poetry.group.dev]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^8.3.5"
|
||||
pytest-benchmark = "^5.1.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["setuptools", "poetry-core"]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
4
setup.py
4
setup.py
@ -1,4 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
from setuptools import setup
|
||||
|
||||
setup()
|
||||
@ -6,9 +6,6 @@ from get_unicode_emojis_list import (
|
||||
EMOJI_TESTFILE_FILENAME,
|
||||
get_all_emojis_from_latest_unicode_emojis_specification_with_download,
|
||||
)
|
||||
from pytest_benchmark.fixture import (
|
||||
BenchmarkFixture,
|
||||
)
|
||||
|
||||
import d3lta.emojis_remover
|
||||
|
||||
@ -17,13 +14,6 @@ import d3lta.emojis_remover
|
||||
name="emojis_remover",
|
||||
params=[
|
||||
d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover,
|
||||
pytest.param(
|
||||
d3lta.emojis_remover.DemojiEmojisRemover,
|
||||
marks=pytest.mark.xfail(
|
||||
reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification",
|
||||
strict=True,
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def fixture_emojis_remover(
|
||||
@ -108,11 +98,8 @@ In consequence whereof, the National 🏞️ Assembly 👩🏭👨🏭 r
|
||||
def test_on_text_sample(
|
||||
emojis_remover: d3lta.emojis_remover.EmojisRemover,
|
||||
sample_text_with_emojipasta: str,
|
||||
sample_text: str,
|
||||
benchmark: BenchmarkFixture,
|
||||
sample_text: str
|
||||
):
|
||||
processed = benchmark(
|
||||
emojis_remover.remove_symbols,
|
||||
assert emojis_remover.remove_symbols(
|
||||
sample_text_with_emojipasta,
|
||||
)
|
||||
assert processed == sample_text
|
||||
) == sample_text
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user