Mirror of https://github.com/VIGINUM-FR/D3lta.git, synced 2025-10-29 21:16:20 +02:00

Compare commits: 26 commits
| Author | SHA1 | Date |
|---|---|---|
|  | 8df5772178 |  |
|  | a18992748e |  |
|  | 045bd4becf |  |
|  | 3cdea198bb |  |
|  | 1bd593cf43 |  |
|  | b41ab2ce19 |  |
|  | 1151e21254 |  |
|  | b8fada79c2 |  |
|  | 95a07bd5a3 |  |
|  | e0c747f43c |  |
|  | 588f20cd4a |  |
|  | a92770562b |  |
|  | 991ed8141b |  |
|  | f5f71cca37 |  |
|  | b1d2b93c24 |  |
|  | 71a76b0d3a |  |
|  | ed3f0b9db3 |  |
|  | 8999d23448 |  |
|  | 427a873568 |  |
|  | 0386589b46 |  |
|  | c589aebc41 |  |
|  | 56a1f07c1e |  |
|  | fb7531405c |  |
|  | 80f12d6ee9 |  |
|  | eb1599ee10 |  |
|  | c7107aae1d |  |
```diff
@@ -17,9 +17,9 @@ RUN pip install --no-cache-dir --upgrade pip
 RUN pipx install poetry==${POETRY_VERSION}
 
 WORKDIR /app
-COPY pyproject.toml poetry.lock setup.py README.md ./
+COPY pyproject.toml poetry.lock README.md LICENSE.txt ./
 # pre-install dependencies
 RUN --mount=type=cache,target=/root/.cache poetry install --no-root
 
 COPY notebooks /app/notebooks/
 COPY d3lta /app/d3lta/
@@ -27,4 +27,8 @@ RUN --mount=type=cache,target=/root/.cache poetry install
 
 FROM d3lta-prod AS d3lta-dev
 
 RUN --mount=type=cache,target=/root/.cache poetry install --with dev
+
+# install nektos/act as specified in https://nektosact.com/installation/index.html#bash-script
+# the -b flag specifies the target directory (cf. https://github.com/nektos/act/blob/61396d8085a9d812cebf94fa954f5938d48bf2b9/install.sh#L13)
+RUN curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -b /usr/bin
```
```diff
@@ -4,6 +4,14 @@
   "name": "Python 3",
   "dockerFile": "./Dockerfile",
   "context": "..",
-  // Features to add to the dev container. More info: https://containers.dev/features.
-  // "features": {}
-}
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "github.vscode-github-actions"
+      ]
+    }
+  },
+  "features": {
+    "ghcr.io/devcontainers/features/docker-in-docker:2": {}
+  }
+}
```
.github/workflows/publish-to-pypi.yml (vendored, 166 changed lines)

```diff
@@ -1,63 +1,69 @@
 # derived from https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow
-name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
+name: Publish Python distribution to PyPI
 
-on: push
+on:
+  release:
+    types: [published]
+
+env:
+  ACT: false # env.ACT == true when running inside nektos/act
+
 jobs:
-  unit-test:
-    name: Run unit tests 🤾
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        persist-credentials: false
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: "3.11"
-    - name: Install dependencies
-      run: |
-        pip install poetry
-        poetry install --with dev
-    - name: Build a binary wheel and a source tarball
-      run:
-        poetry run pytest
 
   build:
-    name: Build distribution 📦
-    needs:
-    - unit-test
+    name: Build distribution
+    # based on https://stackoverflow.com/a/74318141
+    if: ${{ github.event.release.target_commitish == 'main'}}
     runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v4
       with:
        persist-credentials: false
 
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.x"
-    - name: Install pypa/build
-      run: >-
-        python3 -m
-        pip install
-        build
-        --user
    - name: Build a binary wheel and a source tarball
-      run: python3 -m build
+      run: pipx run build
 
    - name: Store the distribution packages
      uses: actions/upload-artifact@v4
      with:
-        name: python-package-distributions
+        name: distfiles
        path: dist/
+        if-no-files-found: error
+
+  # taken from https://github.com/python-poetry/poetry/blob/b580e8aa4fbce53569420e7b42568dfd9e73519f/.github/workflows/release.yaml
+  upload-built-distribution-to-github-release:
+    name: Upload (GitHub)
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    needs: build
+    steps:
+      # Checking-out the project since the gh CLI expects to be called in the context of a git repository.
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+
+      - name: Retrieve built distribution
+        uses: actions/download-artifact@v4
+        with:
+          name: distfiles
+          path: dist/
+
+      - run: gh release upload "${TAG_NAME}" dist/*.{tar.gz,whl}
+        # skip step when debugging locally via nektos/act
+        if: ${{ !env.ACT }}
+        env:
+          GH_TOKEN: ${{ github.token }}
+          TAG_NAME: ${{ github.event.release.tag_name }}
+
  publish-to-pypi:
-    name: >-
-      Publish Python 🐍 distribution 📦 to PyPI
-    if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
-    needs:
-    - build
+    name: Publish Python distribution to PyPI
+    needs: build
    runs-on: ubuntu-latest
    environment:
      name: pypi
@@ -66,77 +72,15 @@ jobs:
       id-token: write # IMPORTANT: mandatory for trusted publishing
 
     steps:
-    - name: Download all the dists
+    - name: Retrieve built distribution
       uses: actions/download-artifact@v4
       with:
-        name: python-package-distributions
+        name: distfiles
         path: dist/
-    - name: Publish distribution 📦 to PyPI
-      uses: pypa/gh-action-pypi-publish@release/v1
 
-  github-release:
-    name: >-
-      Sign the Python 🐍 distribution 📦 with Sigstore
-      and upload them to GitHub Release
-    needs:
-    - publish-to-pypi
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: write # IMPORTANT: mandatory for making GitHub Releases
-      id-token: write # IMPORTANT: mandatory for sigstore
-
-    steps:
-    - name: Download all the dists
-      uses: actions/download-artifact@v4
-      with:
-        name: python-package-distributions
-        path: dist/
-    - name: Sign the dists with Sigstore
-      uses: sigstore/gh-action-sigstore-python@v3.0.0
-      with:
-        inputs: >-
-          ./dist/*.tar.gz
-          ./dist/*.whl
-    - name: Create GitHub Release
-      env:
-        GITHUB_TOKEN: ${{ github.token }}
-      run: >-
-        gh release create
-        "$GITHUB_REF_NAME"
-        --repo "$GITHUB_REPOSITORY"
-        --notes ""
-    - name: Upload artifact signatures to GitHub Release
-      env:
-        GITHUB_TOKEN: ${{ github.token }}
-      # Upload to GitHub Release using the `gh` CLI.
-      # `dist/` contains the built packages, and the
-      # sigstore-produced signatures and certificates.
-      run: >-
-        gh release upload
-        "$GITHUB_REF_NAME" dist/**
-        --repo "$GITHUB_REPOSITORY"
+    - name: Publish distribution to PyPI
+      # skip step when debugging locally via nektos/act
+      if: ${{ !env.ACT }}
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        print-hash: true
-
-  publish-to-testpypi:
-    name: Publish Python 🐍 distribution 📦 to TestPyPI
-    needs:
-    - build
-    runs-on: ubuntu-latest
-
-    environment:
-      name: testpypi
-      url: https://test.pypi.org/p/d3lta # pypi is case insensitive so d3lta == D3lta
-
-    permissions:
-      id-token: write # IMPORTANT: mandatory for trusted publishing
-
-    steps:
-    - name: Download all the dists
-      uses: actions/download-artifact@v4
-      with:
-        name: python-package-distributions
-        path: dist/
-    - name: Publish distribution 📦 to TestPyPI
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        repository-url: https://test.pypi.org/legacy/
```
.github/workflows/test.yml (vendored, new file, 28 lines)

```diff
@@ -0,0 +1,28 @@
+name: Run tests
+
+on: push
+
+jobs:
+  unit-test:
+    name: Run unit tests
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+
+      - name: Install poetry
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: poetry
+
+      - name: Install dependencies
+        run: poetry install --with dev
+
+      - name: Run tests
+        run: poetry run pytest
```
.gitignore (vendored, 3 changed lines)

```diff
@@ -228,4 +228,5 @@ pyrightconfig.json
 *.ftz
 use_model_kaggle/
 include/
 .benchmarks
+.act-event.json
```
README.md (62 changed lines)

````diff
@@ -1,43 +1,22 @@
-<h2 align="center"> <a href="https://arxiv.org/abs/2312.17338">D3lta</a></h2>
+# D3lta
 
-<h5 align="center">
+[](https://pypi.org/project/d3lta/)
+[](https://arxiv.org/abs/2312.17338)
-If you like our project, please give us a star ⭐ on GitHub for the latest update. </h2>
-
-</h5>
-
-<div align=center>
-
-[](https://arxiv.org/abs/2312.17338)
 
 This repository is the official implementation of D3lta, a library for detecting duplicate verbatim contents within a vast amount of documents.
 
 It distinguishes 3 types of duplicate contents : copypasta (almost exact duplicates), rewording and translation. You can run it on CPU.
-</div>
 
 ---
 
 <img style="display: block; margin: auto;" src="https://github.com/VIGINUM-FR/D3lta/raw/main/static/graph.gif"/>
 
 
-## 💻 Installing
+## 💻 Installation
 
-Clone the repository
-
 ```bash
-git clone https://github.com/VIGINUM-FR/D3lta
-```
-
-Navigate to the project
-
-```bash
-cd D3lta
-```
-
-Install the package
-
-```bash
-pip install -e .
+# PyPI is case insensitive, so d3lta == D3lta
+pip install d3lta
 ```
 
 ## 🚀 Quick start
@@ -163,11 +142,10 @@ matches, df_clusters = semantic_faiss(
 matches
 ```
 
-
-
 ## 📚 Synthetic dataset
 
-The dataset is available in the release `1.0.0`. It contains the following files:
+The dataset is available in the [`1.0.0` release](https://github.com/VIGINUM-FR/D3lta/releases/tag/1.0.0).
+It contains the following files:
 
 ### `synthetic_dataset_documents.csv`:
 
@@ -205,10 +183,30 @@ Column details:
 
 ## Notebooks
 
-In folder the [`notebooks`](./notebooks/), you can find:
-- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): Example of applying threedelta methodology to the synthetic dataset, with a comparison to the true labels.
+In the [`notebooks`](./notebooks/) directory, you can find:
+- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): example of applying the D3lta methodology to the synthetic dataset, with a comparison to the true labels.
 
 
+## 👩‍💻 Developing
+
+Clone the repository
+
+```bash
+git clone https://github.com/VIGINUM-FR/D3lta
+```
+
+Navigate to the project
+
+```bash
+cd D3lta
+```
+
+Install the package
+
+```bash
+pip install -e .
+```
+
 ## Citation
 
 If you find our paper and code useful in your research, please consider giving a star 🌟 and a citation 📝:
````
```diff
@@ -3,8 +3,6 @@ from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import final
 
-import demoji
-
 
 @dataclass
 class EmojisRemover(ABC):
@@ -96,8 +94,3 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
 
     def _remove_symbols_implementation(self, text: str) -> str:
         return self.SYMBOLS_REGEX.sub(r"", text)
-
-
-class DemojiEmojisRemover(EmojisRemover):
-    def _remove_symbols_implementation(self, text: str) -> str:
-        return demoji.replace(text)
```
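For orientation, `ExplicitUnicodeBlocksEmojisRemover` (the implementation kept by this change) does its cleaning through the `EmojisRemover` base class and is exercised in the test diff further below via `remove_symbols`. A minimal usage sketch, assuming the remover can be constructed without arguments, as the test fixture suggests:

```python
# Minimal sketch, not taken from the diff: assumes ExplicitUnicodeBlocksEmojisRemover
# can be instantiated without arguments.
from d3lta.emojis_remover import ExplicitUnicodeBlocksEmojisRemover

remover = ExplicitUnicodeBlocksEmojisRemover()

# remove_symbols() strips the emoji/symbol code points matched by the class's SYMBOLS_REGEX.
cleaned = remover.remove_symbols("D3lta 🚀 detects duplicate ✨ contents")
print(cleaned)  # expected: the same sentence without the emoji characters
```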
poetry.lock (generated, 75 changed lines)

```diff
@@ -155,21 +155,6 @@ files = [
 ]
 markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
 
-[[package]]
-name = "demoji"
-version = "1.1.0"
-description = "Accurately remove and replace emojis in text strings"
-optional = false
-python-versions = ">=3.6"
-groups = ["main"]
-files = [
-    {file = "demoji-1.1.0-py3-none-any.whl", hash = "sha256:6d3256c909aea299e97fe984f827a2a060c2a8f8bfcbafa7ec9659967c5df50f"},
-    {file = "demoji-1.1.0.tar.gz", hash = "sha256:072efaeca725e6f63ab59d83abeb55b178842538ed9256455a82ebbd055ff216"},
-]
-
-[package.extras]
-ujson = ["ujson"]
-
 [[package]]
 name = "exceptiongroup"
 version = "1.3.0"
@@ -1060,18 +1045,6 @@ files = [
     {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
 ]
 
-[[package]]
-name = "py-cpuinfo"
-version = "9.0.0"
-description = "Get CPU info with pure Python"
-optional = false
-python-versions = "*"
-groups = ["dev"]
-files = [
-    {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"},
-    {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"},
-]
-
 [[package]]
 name = "pybind11"
 version = "2.13.6"
@@ -1125,27 +1098,6 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
 [package.extras]
 dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
 
-[[package]]
-name = "pytest-benchmark"
-version = "5.1.0"
-description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer."
-optional = false
-python-versions = ">=3.9"
-groups = ["dev"]
-files = [
-    {file = "pytest-benchmark-5.1.0.tar.gz", hash = "sha256:9ea661cdc292e8231f7cd4c10b0319e56a2118e2c09d9f50e1b3d150d2aca105"},
-    {file = "pytest_benchmark-5.1.0-py3-none-any.whl", hash = "sha256:922de2dfa3033c227c96da942d1878191afa135a29485fb942e85dff1c592c89"},
-]
-
-[package.dependencies]
-py-cpuinfo = "*"
-pytest = ">=8.1"
-
-[package.extras]
-aspect = ["aspectlib"]
-elasticsearch = ["elasticsearch"]
-histogram = ["pygal", "pygaljs", "setuptools"]
-
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -1291,6 +1243,28 @@ files = [
     {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
 ]
 
+[[package]]
+name = "smart-open"
+version = "5.1.0"
+description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
+optional = false
+python-versions = ">=3.6.*"
+groups = ["main"]
+markers = "python_version >= \"3.12\""
+files = [
+    {file = "smart_open-5.1.0-py3-none-any.whl", hash = "sha256:2059b07f530c8c9e2158e4e1575309aacb74bd813da2325c1f348015d04f3bd6"},
+    {file = "smart_open-5.1.0.tar.gz", hash = "sha256:e4dc1350b240ef0759e343e4e2f361bfd4e5477bb2619866e97f80240652e92e"},
+]
+
+[package.extras]
+all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "requests"]
+azure = ["azure-common", "azure-core", "azure-storage-blob"]
+gcs = ["google-cloud-storage"]
+http = ["requests"]
+s3 = ["boto3"]
+test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "moto[server] (==1.3.14)", "parameterizedtestcase", "paramiko", "pathlib2", "pytest", "pytest-rerunfailures", "requests", "responses"]
+webhdfs = ["requests"]
+
 [[package]]
 name = "smart-open"
 version = "7.1.0"
@@ -1298,6 +1272,7 @@ description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storag
 optional = false
 python-versions = "<4.0,>=3.7"
 groups = ["main"]
+markers = "python_version <= \"3.11\""
 files = [
     {file = "smart_open-7.1.0-py3-none-any.whl", hash = "sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"},
     {file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"},
@@ -1747,5 +1722,5 @@ files = [
 
 [metadata]
 lock-version = "2.1"
-python-versions = "^3.10"
+python-versions = ">=3.10"
-content-hash = "2a469cf6cd729d58a4315152a037a242fdc09dba63fe3adfe00bbb88c3f16863"
+content-hash = "3315c4aedc40f50a78569149ca711d514b9a06b30c9c3b5a6f1402e5abf7e032"
```
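The lock file now carries two `smart-open` entries selected by complementary environment markers (`python_version >= "3.12"` vs. `python_version <= "3.11"`). A small illustration of how such markers resolve, using the third-party `packaging` library, which is not a d3lta dependency and is used here purely as a sketch:

```python
# Illustration only: how environment markers resolve for the running interpreter.
# Requires the third-party `packaging` library (not part of d3lta's dependencies).
from packaging.markers import Marker

for spec in ('python_version >= "3.12"', 'python_version <= "3.11"'):
    # Exactly one of the two markers evaluates to True, so exactly one
    # of the two locked smart-open versions (5.1.0 or 7.1.0) is installed.
    print(spec, "->", Marker(spec).evaluate())
```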
```diff
@@ -1,32 +1,32 @@
-[tool.poetry]
+[project]
 name = "d3lta"
-version = "1.0.0.post2"
+version = "1.0.2"
 description = "A library for detecting verbatim-duplicated contents within a vast amount of documents"
 readme = "README.md"
-authors = ["Viginum"]
+authors = [{ name = "VIGINUM" }]
+license = { file = "LICENSE.txt" }
+requires-python = ">=3.10"
+dependencies = [
+    "faiss-cpu==1.9.0.post1",
+    "fasttext==0.9.3",
+    "gensim==4.3.3",
+    "networkx==2.8.8",
+    "pandas==2.2.3",
+    "polyleven==0.8",
+    "scipy==1.12.0",
+    "tensorflow==2.18.0",
+    "tensorflow-hub==0.16.1",
+    "tensorflow-text==2.18.1",
+    "tqdm==4.67.1",
+]
 
-[tool.poetry.dependencies]
-python = "^3.10"
-demoji = "^1.1.0"
-faiss-cpu = "1.9.0.post1"
-fasttext = "0.9.3"
-gensim = "4.3.3"
-networkx = "2.8.8"
-pandas = "2.2.3"
-polyleven = "0.8"
-scipy = "1.12.0"
-tensorflow = "2.18.0"
-tensorflow-hub = "0.16.1"
-tensorflow-text = "2.18.1"
-tqdm = "4.67.1"
 
 [tool.poetry.group.dev]
 optional = true
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.5"
-pytest-benchmark = "^5.1.0"
 
 [build-system]
-requires = ["setuptools", "poetry-core"]
+requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
```
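As a quick sanity check of the PEP 621 metadata introduced above, the declared name, version and Python requirement can be read back at runtime from an installed copy of the package. A sketch, assuming d3lta 1.0.2 has been installed (e.g. via `pip install d3lta`):

```python
# Sketch: reading back the packaging metadata declared in [project] above.
# Assumes the package is installed in the current environment.
from importlib.metadata import metadata, version

print(version("d3lta"))                      # expected: 1.0.2 for this revision
print(metadata("d3lta")["Requires-Python"])  # expected: >=3.10
```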
setup.py (deleted, 4 lines)

```diff
@@ -1,4 +0,0 @@
-#!/usr/bin/env python
-from setuptools import setup
-
-setup()
```
```diff
@@ -6,9 +6,6 @@ from get_unicode_emojis_list import (
     EMOJI_TESTFILE_FILENAME,
     get_all_emojis_from_latest_unicode_emojis_specification_with_download,
 )
-from pytest_benchmark.fixture import (
-    BenchmarkFixture,
-)
 
 import d3lta.emojis_remover
 
@@ -17,13 +14,6 @@ import d3lta.emojis_remover
     name="emojis_remover",
     params=[
         d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover,
-        pytest.param(
-            d3lta.emojis_remover.DemojiEmojisRemover,
-            marks=pytest.mark.xfail(
-                reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification",
-                strict=True,
-            ),
-        ),
     ],
 )
 def fixture_emojis_remover(
@@ -108,11 +98,8 @@ In consequence whereof, the National 🏞️ Assembly 👩🏭👨🏭 r
 def test_on_text_sample(
     emojis_remover: d3lta.emojis_remover.EmojisRemover,
     sample_text_with_emojipasta: str,
-    sample_text: str,
-    benchmark: BenchmarkFixture,
+    sample_text: str
 ):
-    processed = benchmark(
-        emojis_remover.remove_symbols,
+    assert emojis_remover.remove_symbols(
         sample_text_with_emojipasta,
-    )
-    assert processed == sample_text
+    ) == sample_text
```