зеркало из
				https://github.com/VIGINUM-FR/D3lta.git
				synced 2025-10-30 21:46:11 +02:00 
			
		
		
		
	Сравнить коммиты
	
		
			26 Коммитов
		
	
	
		
	
	| Автор | SHA1 | Дата | |
|---|---|---|---|
|   | 8df5772178 | ||
|   | a18992748e | ||
|   | 045bd4becf | ||
|   | 3cdea198bb | ||
|   | 1bd593cf43 | ||
|   | b41ab2ce19 | ||
|   | 1151e21254 | ||
|   | b8fada79c2 | ||
|   | 95a07bd5a3 | ||
|   | e0c747f43c | ||
|   | 588f20cd4a | ||
|   | a92770562b | ||
|   | 991ed8141b | ||
|   | f5f71cca37 | ||
|   | b1d2b93c24 | ||
|   | 71a76b0d3a | ||
|   | ed3f0b9db3 | ||
|   | 8999d23448 | ||
|   | 427a873568 | ||
|   | 0386589b46 | ||
|   | c589aebc41 | ||
|   | 56a1f07c1e | ||
|   | fb7531405c | ||
|   | 80f12d6ee9 | ||
|   | eb1599ee10 | ||
|   | c7107aae1d | 
| @ -17,7 +17,7 @@ RUN pip install --no-cache-dir --upgrade pip | |||||||
| RUN pipx install poetry==${POETRY_VERSION} | RUN pipx install poetry==${POETRY_VERSION} | ||||||
| 
 | 
 | ||||||
| WORKDIR /app | WORKDIR /app | ||||||
| COPY pyproject.toml poetry.lock setup.py README.md ./ | COPY pyproject.toml poetry.lock README.md LICENSE.txt ./ | ||||||
| # pre-install dependencies | # pre-install dependencies | ||||||
| RUN --mount=type=cache,target=/root/.cache poetry install --no-root | RUN --mount=type=cache,target=/root/.cache poetry install --no-root | ||||||
| 
 | 
 | ||||||
| @ -28,3 +28,7 @@ RUN --mount=type=cache,target=/root/.cache poetry install | |||||||
| FROM d3lta-prod AS d3lta-dev | FROM d3lta-prod AS d3lta-dev | ||||||
| 
 | 
 | ||||||
| RUN --mount=type=cache,target=/root/.cache poetry install --with dev | RUN --mount=type=cache,target=/root/.cache poetry install --with dev | ||||||
|  | 
 | ||||||
|  | # install nektos/act as specified in https://nektosact.com/installation/index.html#bash-script | ||||||
|  | # the -b flag specifies the target directory (cf. https://github.com/nektos/act/blob/61396d8085a9d812cebf94fa954f5938d48bf2b9/install.sh#L13) | ||||||
|  | RUN curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -b /usr/bin | ||||||
|  | |||||||
| @ -4,6 +4,14 @@ | |||||||
| 	"name": "Python 3", | 	"name": "Python 3", | ||||||
| 	"dockerFile": "./Dockerfile", | 	"dockerFile": "./Dockerfile", | ||||||
| 	"context": "..", | 	"context": "..", | ||||||
| 	// Features to add to the dev container. More info: https://containers.dev/features. | 	"customizations": { | ||||||
| 	// "features": {} | 		"vscode": { | ||||||
|  | 			"extensions": [ | ||||||
|  | 				"github.vscode-github-actions" | ||||||
|  | 			] | ||||||
|  | 		} | ||||||
|  | 	}, | ||||||
|  | 	"features": { | ||||||
|  | 		"ghcr.io/devcontainers/features/docker-in-docker:2": {} | ||||||
|  | 	} | ||||||
| } | } | ||||||
							
								
								
									
										156
									
								
								.github/workflows/publish-to-pypi.yml
									
									
									
									
										поставляемый
									
									
								
							
							
						
						
									
										156
									
								
								.github/workflows/publish-to-pypi.yml
									
									
									
									
										поставляемый
									
									
								
							| @ -1,63 +1,69 @@ | |||||||
| # derived from https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow | # derived from https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow | ||||||
| name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI | name: Publish Python distribution to PyPI | ||||||
| 
 | 
 | ||||||
| on: push | on: | ||||||
|  |   release: | ||||||
|  |     types: [published] | ||||||
|  | 
 | ||||||
|  | env: | ||||||
|  |   ACT: false  # env.ACT == true when running inside nektos/act | ||||||
| 
 | 
 | ||||||
| jobs: | jobs: | ||||||
|   unit-test: |  | ||||||
|     name: Run unit tests 🤾 |  | ||||||
|     runs-on: ubuntu-latest |  | ||||||
| 
 |  | ||||||
|     steps: |  | ||||||
|     - uses: actions/checkout@v4 |  | ||||||
|       with: |  | ||||||
|         persist-credentials: false |  | ||||||
|     - name: Set up Python |  | ||||||
|       uses: actions/setup-python@v5 |  | ||||||
|       with: |  | ||||||
|         python-version: "3.11" |  | ||||||
|     - name: Install dependencies |  | ||||||
|       run: | |  | ||||||
|         pip install poetry |  | ||||||
|         poetry install --with dev  |  | ||||||
|     - name: Build a binary wheel and a source tarball |  | ||||||
|       run:  |  | ||||||
|         poetry run pytest |  | ||||||
| 
 |  | ||||||
|   build: |   build: | ||||||
|     name: Build distribution 📦 |     name: Build distribution | ||||||
|     needs: |     # based on https://stackoverflow.com/a/74318141 | ||||||
|     - unit-test |     if: ${{ github.event.release.target_commitish == 'main'}} | ||||||
|     runs-on: ubuntu-latest |     runs-on: ubuntu-latest | ||||||
| 
 | 
 | ||||||
|     steps: |     steps: | ||||||
|     - uses: actions/checkout@v4 |     - uses: actions/checkout@v4 | ||||||
|       with: |       with: | ||||||
|         persist-credentials: false |         persist-credentials: false | ||||||
|  | 
 | ||||||
|     - name: Set up Python |     - name: Set up Python | ||||||
|       uses: actions/setup-python@v5 |       uses: actions/setup-python@v5 | ||||||
|       with: |       with: | ||||||
|         python-version: "3.x" |         python-version: "3.x" | ||||||
|     - name: Install pypa/build | 
 | ||||||
|       run: >- |  | ||||||
|         python3 -m |  | ||||||
|         pip install |  | ||||||
|         build |  | ||||||
|         --user |  | ||||||
|     - name: Build a binary wheel and a source tarball |     - name: Build a binary wheel and a source tarball | ||||||
|       run: python3 -m build |       run: pipx run build | ||||||
|  | 
 | ||||||
|     - name: Store the distribution packages |     - name: Store the distribution packages | ||||||
|       uses: actions/upload-artifact@v4 |       uses: actions/upload-artifact@v4 | ||||||
|       with: |       with: | ||||||
|         name: python-package-distributions |         name: distfiles | ||||||
|  |         path: dist/ | ||||||
|  |         if-no-files-found: error | ||||||
|  | 
 | ||||||
|  |   # taken from https://github.com/python-poetry/poetry/blob/b580e8aa4fbce53569420e7b42568dfd9e73519f/.github/workflows/release.yaml | ||||||
|  |   upload-built-distribution-to-github-release: | ||||||
|  |     name: Upload (GitHub) | ||||||
|  |     runs-on: ubuntu-latest | ||||||
|  |     permissions: | ||||||
|  |       contents: write | ||||||
|  |     needs: build | ||||||
|  |     steps: | ||||||
|  |       # Checking-out the project since the gh CLI expects to be called in the context of a git repository. | ||||||
|  |       - uses: actions/checkout@v4 | ||||||
|  |         with: | ||||||
|  |           persist-credentials: false | ||||||
|  | 
 | ||||||
|  |       - name: Retrieve built distribution | ||||||
|  |         uses: actions/download-artifact@v4 | ||||||
|  |         with: | ||||||
|  |           name: distfiles | ||||||
|           path: dist/ |           path: dist/ | ||||||
| 
 | 
 | ||||||
|  |       - run: gh release upload "${TAG_NAME}" dist/*.{tar.gz,whl} | ||||||
|  |         # skip step when debugging locally via nektos/act | ||||||
|  |         if: ${{ !env.ACT }} | ||||||
|  |         env: | ||||||
|  |           GH_TOKEN: ${{ github.token }} | ||||||
|  |           TAG_NAME: ${{ github.event.release.tag_name }} | ||||||
|  | 
 | ||||||
|   publish-to-pypi: |   publish-to-pypi: | ||||||
|     name: >- |     name: Publish Python distribution to PyPI | ||||||
|       Publish Python 🐍 distribution 📦 to PyPI |     needs: build | ||||||
|     if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes |  | ||||||
|     needs: |  | ||||||
|     - build |  | ||||||
|     runs-on: ubuntu-latest |     runs-on: ubuntu-latest | ||||||
|     environment: |     environment: | ||||||
|       name: pypi |       name: pypi | ||||||
| @ -66,77 +72,15 @@ jobs: | |||||||
|       id-token: write  # IMPORTANT: mandatory for trusted publishing |       id-token: write  # IMPORTANT: mandatory for trusted publishing | ||||||
| 
 | 
 | ||||||
|     steps: |     steps: | ||||||
|     - name: Download all the dists |       - name: Retrieve built distribution | ||||||
|         uses: actions/download-artifact@v4 |         uses: actions/download-artifact@v4 | ||||||
|         with: |         with: | ||||||
|         name: python-package-distributions |           name: distfiles | ||||||
|           path: dist/ |           path: dist/ | ||||||
|     - name: Publish distribution 📦 to PyPI |  | ||||||
|       uses: pypa/gh-action-pypi-publish@release/v1 |  | ||||||
| 
 | 
 | ||||||
|   github-release: |       - name: Publish distribution to PyPI | ||||||
|     name: >- |         # skip step when debugging locally via nektos/act | ||||||
|       Sign the Python 🐍 distribution 📦 with Sigstore |         if: ${{ !env.ACT }} | ||||||
|       and upload them to GitHub Release |  | ||||||
|     needs: |  | ||||||
|     - publish-to-pypi |  | ||||||
|     runs-on: ubuntu-latest |  | ||||||
| 
 |  | ||||||
|     permissions: |  | ||||||
|       contents: write  # IMPORTANT: mandatory for making GitHub Releases |  | ||||||
|       id-token: write  # IMPORTANT: mandatory for sigstore |  | ||||||
| 
 |  | ||||||
|     steps: |  | ||||||
|     - name: Download all the dists |  | ||||||
|       uses: actions/download-artifact@v4 |  | ||||||
|       with: |  | ||||||
|         name: python-package-distributions |  | ||||||
|         path: dist/ |  | ||||||
|     - name: Sign the dists with Sigstore |  | ||||||
|       uses: sigstore/gh-action-sigstore-python@v3.0.0 |  | ||||||
|       with: |  | ||||||
|         inputs: >- |  | ||||||
|           ./dist/*.tar.gz |  | ||||||
|           ./dist/*.whl |  | ||||||
|     - name: Create GitHub Release |  | ||||||
|       env: |  | ||||||
|         GITHUB_TOKEN: ${{ github.token }} |  | ||||||
|       run: >- |  | ||||||
|         gh release create |  | ||||||
|         "$GITHUB_REF_NAME" |  | ||||||
|         --repo "$GITHUB_REPOSITORY" |  | ||||||
|         --notes "" |  | ||||||
|     - name: Upload artifact signatures to GitHub Release |  | ||||||
|       env: |  | ||||||
|         GITHUB_TOKEN: ${{ github.token }} |  | ||||||
|       # Upload to GitHub Release using the `gh` CLI. |  | ||||||
|       # `dist/` contains the built packages, and the |  | ||||||
|       # sigstore-produced signatures and certificates. |  | ||||||
|       run: >- |  | ||||||
|         gh release upload |  | ||||||
|         "$GITHUB_REF_NAME" dist/** |  | ||||||
|         --repo "$GITHUB_REPOSITORY" |  | ||||||
| 
 |  | ||||||
|   publish-to-testpypi: |  | ||||||
|     name: Publish Python 🐍 distribution 📦 to TestPyPI |  | ||||||
|     needs: |  | ||||||
|     - build |  | ||||||
|     runs-on: ubuntu-latest |  | ||||||
| 
 |  | ||||||
|     environment: |  | ||||||
|       name: testpypi |  | ||||||
|       url: https://test.pypi.org/p/d3lta  # pypi is case insensitive so d3lta == D3lta |  | ||||||
| 
 |  | ||||||
|     permissions: |  | ||||||
|       id-token: write  # IMPORTANT: mandatory for trusted publishing |  | ||||||
| 
 |  | ||||||
|     steps: |  | ||||||
|     - name: Download all the dists |  | ||||||
|       uses: actions/download-artifact@v4 |  | ||||||
|       with: |  | ||||||
|         name: python-package-distributions |  | ||||||
|         path: dist/ |  | ||||||
|     - name: Publish distribution 📦 to TestPyPI |  | ||||||
|         uses: pypa/gh-action-pypi-publish@release/v1 |         uses: pypa/gh-action-pypi-publish@release/v1 | ||||||
|         with: |         with: | ||||||
|         repository-url: https://test.pypi.org/legacy/ |           print-hash: true | ||||||
|  | |||||||
							
								
								
									
										28
									
								
								.github/workflows/test.yml
									
									
									
									
										поставляемый
									
									
										Обычный файл
									
								
							
							
						
						
									
										28
									
								
								.github/workflows/test.yml
									
									
									
									
										поставляемый
									
									
										Обычный файл
									
								
							| @ -0,0 +1,28 @@ | |||||||
|  | name: Run tests | ||||||
|  | 
 | ||||||
|  | on: push | ||||||
|  | 
 | ||||||
|  | jobs: | ||||||
|  |   unit-test: | ||||||
|  |     name: Run unit tests | ||||||
|  |     runs-on: ubuntu-latest | ||||||
|  | 
 | ||||||
|  |     steps: | ||||||
|  |     - uses: actions/checkout@v4 | ||||||
|  |       with: | ||||||
|  |         persist-credentials: false | ||||||
|  | 
 | ||||||
|  |     - name: Install poetry | ||||||
|  |       run: pipx install poetry | ||||||
|  | 
 | ||||||
|  |     - name: Set up Python | ||||||
|  |       uses: actions/setup-python@v5 | ||||||
|  |       with: | ||||||
|  |         python-version: "3.11" | ||||||
|  |         cache: poetry | ||||||
|  | 
 | ||||||
|  |     - name: Install dependencies | ||||||
|  |       run: poetry install --with dev | ||||||
|  | 
 | ||||||
|  |     - name: Run tests | ||||||
|  |       run: poetry run pytest | ||||||
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										поставляемый
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										поставляемый
									
									
								
							| @ -229,3 +229,4 @@ pyrightconfig.json | |||||||
| use_model_kaggle/ | use_model_kaggle/ | ||||||
| include/ | include/ | ||||||
| .benchmarks | .benchmarks | ||||||
|  | .act-event.json | ||||||
							
								
								
									
										60
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										60
									
								
								README.md
									
									
									
									
									
								
							| @ -1,43 +1,22 @@ | |||||||
| <h2 align="center"> <a href="https://arxiv.org/abs/2312.17338">D3lta</a></h2> | # D3lta | ||||||
| 
 |  | ||||||
| <h5 align="center">  |  | ||||||
| 
 |  | ||||||
| If you like our project, please give us a star ⭐ on GitHub for the latest update.  </h2> |  | ||||||
| 
 |  | ||||||
| </h5> |  | ||||||
| 
 |  | ||||||
| <div align=center> |  | ||||||
| 
 | 
 | ||||||
|  | [](https://pypi.org/project/d3lta/) | ||||||
| [](https://arxiv.org/abs/2312.17338) | [](https://arxiv.org/abs/2312.17338) | ||||||
| 
 | 
 | ||||||
| This repository is the official implementation of D3lta, a library for detecting duplicate verbatim contents within a vast amount of documents. | This repository is the official implementation of D3lta, a library for detecting duplicate verbatim contents within a vast amount of documents. | ||||||
| 
 | 
 | ||||||
| It distinguishes 3 types of duplicate contents : copypasta (almost exact duplicates), rewording and translation. You can run it on CPU. | It distinguishes 3 types of duplicate contents : copypasta (almost exact duplicates), rewording and translation. You can run it on CPU. | ||||||
| </div> |  | ||||||
| 
 | 
 | ||||||
| --- | --- | ||||||
| 
 | 
 | ||||||
| <img style="display: block; margin: auto;" src="https://github.com/VIGINUM-FR/D3lta/raw/main/static/graph.gif"/> | <img style="display: block; margin: auto;" src="https://github.com/VIGINUM-FR/D3lta/raw/main/static/graph.gif"/> | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| ## 💻 Installing  | ## 💻 Installation | ||||||
| 
 |  | ||||||
| Clone the repository |  | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| git clone https://github.com/VIGINUM-FR/D3lta | # PyPI is case insensitive, so d3lta == D3lta | ||||||
| ``` | pip install d3lta | ||||||
| 
 |  | ||||||
| Navigate to the project |  | ||||||
| 
 |  | ||||||
| ```bash |  | ||||||
| cd D3lta |  | ||||||
| ``` |  | ||||||
| 
 |  | ||||||
| Install the package |  | ||||||
| 
 |  | ||||||
| ```bash |  | ||||||
| pip install -e . |  | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| ## 🚀 Quick start | ## 🚀 Quick start | ||||||
| @ -163,11 +142,10 @@ matches, df_clusters = semantic_faiss( | |||||||
| matches | matches | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| ## 📚 Synthetic dataset | ## 📚 Synthetic dataset | ||||||
| 
 | 
 | ||||||
| The dataset is available in the release `1.0.0`. It contains the following files: | The dataset is available in the [`1.0.0` release](https://github.com/VIGINUM-FR/D3lta/releases/tag/1.0.0). | ||||||
|  | It contains the following files: | ||||||
| 
 | 
 | ||||||
| ### `synthetic_dataset_documents.csv`: | ### `synthetic_dataset_documents.csv`: | ||||||
| 
 | 
 | ||||||
| @ -205,10 +183,30 @@ Column details: | |||||||
| 
 | 
 | ||||||
| ## Notebooks | ## Notebooks | ||||||
| 
 | 
 | ||||||
| In folder the [`notebooks`](./notebooks/), you can find:  | In the [`notebooks`](./notebooks/) directory, you can find:  | ||||||
| - [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): Example of applying threedelta methodology to the synthetic dataset, with a comparison to the true labels. | - [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): example of applying the D3lta methodology to the synthetic dataset, with a comparison to the true labels. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | ## 👩💻 Developing | ||||||
|  | 
 | ||||||
|  | Clone the repository | ||||||
|  | 
 | ||||||
|  | ```bash | ||||||
|  | git clone https://github.com/VIGINUM-FR/D3lta | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | Navigate to the project | ||||||
|  | 
 | ||||||
|  | ```bash | ||||||
|  | cd D3lta | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | Install the package | ||||||
|  | 
 | ||||||
|  | ```bash | ||||||
|  | pip install -e . | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
| ## Citation | ## Citation | ||||||
| 
 | 
 | ||||||
| If you find our paper and code useful in your research, please consider giving a star 🌟  and a citation 📝: | If you find our paper and code useful in your research, please consider giving a star 🌟  and a citation 📝: | ||||||
|  | |||||||
| @ -3,8 +3,6 @@ from abc import ABC, abstractmethod | |||||||
| from dataclasses import dataclass | from dataclasses import dataclass | ||||||
| from typing import final | from typing import final | ||||||
| 
 | 
 | ||||||
| import demoji |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| @dataclass | @dataclass | ||||||
| class EmojisRemover(ABC): | class EmojisRemover(ABC): | ||||||
| @ -96,8 +94,3 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover): | |||||||
| 
 | 
 | ||||||
|     def _remove_symbols_implementation(self, text: str) -> str: |     def _remove_symbols_implementation(self, text: str) -> str: | ||||||
|         return self.SYMBOLS_REGEX.sub(r"", text) |         return self.SYMBOLS_REGEX.sub(r"", text) | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class DemojiEmojisRemover(EmojisRemover): |  | ||||||
|     def _remove_symbols_implementation(self, text: str) -> str: |  | ||||||
|         return demoji.replace(text) |  | ||||||
|  | |||||||
							
								
								
									
										75
									
								
								poetry.lock
									
									
									
										сгенерированный
									
									
									
								
							
							
						
						
									
										75
									
								
								poetry.lock
									
									
									
										сгенерированный
									
									
									
								
							| @ -155,21 +155,6 @@ files = [ | |||||||
| ] | ] | ||||||
| markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} | markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} | ||||||
| 
 | 
 | ||||||
| [[package]] |  | ||||||
| name = "demoji" |  | ||||||
| version = "1.1.0" |  | ||||||
| description = "Accurately remove and replace emojis in text strings" |  | ||||||
| optional = false |  | ||||||
| python-versions = ">=3.6" |  | ||||||
| groups = ["main"] |  | ||||||
| files = [ |  | ||||||
|     {file = "demoji-1.1.0-py3-none-any.whl", hash = "sha256:6d3256c909aea299e97fe984f827a2a060c2a8f8bfcbafa7ec9659967c5df50f"}, |  | ||||||
|     {file = "demoji-1.1.0.tar.gz", hash = "sha256:072efaeca725e6f63ab59d83abeb55b178842538ed9256455a82ebbd055ff216"}, |  | ||||||
| ] |  | ||||||
| 
 |  | ||||||
| [package.extras] |  | ||||||
| ujson = ["ujson"] |  | ||||||
| 
 |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "exceptiongroup" | name = "exceptiongroup" | ||||||
| version = "1.3.0" | version = "1.3.0" | ||||||
| @ -1060,18 +1045,6 @@ files = [ | |||||||
|     {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, |     {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| [[package]] |  | ||||||
| name = "py-cpuinfo" |  | ||||||
| version = "9.0.0" |  | ||||||
| description = "Get CPU info with pure Python" |  | ||||||
| optional = false |  | ||||||
| python-versions = "*" |  | ||||||
| groups = ["dev"] |  | ||||||
| files = [ |  | ||||||
|     {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, |  | ||||||
|     {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, |  | ||||||
| ] |  | ||||||
| 
 |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "pybind11" | name = "pybind11" | ||||||
| version = "2.13.6" | version = "2.13.6" | ||||||
| @ -1125,27 +1098,6 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""} | |||||||
| [package.extras] | [package.extras] | ||||||
| dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] | dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] | ||||||
| 
 | 
 | ||||||
| [[package]] |  | ||||||
| name = "pytest-benchmark" |  | ||||||
| version = "5.1.0" |  | ||||||
| description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer." |  | ||||||
| optional = false |  | ||||||
| python-versions = ">=3.9" |  | ||||||
| groups = ["dev"] |  | ||||||
| files = [ |  | ||||||
|     {file = "pytest-benchmark-5.1.0.tar.gz", hash = "sha256:9ea661cdc292e8231f7cd4c10b0319e56a2118e2c09d9f50e1b3d150d2aca105"}, |  | ||||||
|     {file = "pytest_benchmark-5.1.0-py3-none-any.whl", hash = "sha256:922de2dfa3033c227c96da942d1878191afa135a29485fb942e85dff1c592c89"}, |  | ||||||
| ] |  | ||||||
| 
 |  | ||||||
| [package.dependencies] |  | ||||||
| py-cpuinfo = "*" |  | ||||||
| pytest = ">=8.1" |  | ||||||
| 
 |  | ||||||
| [package.extras] |  | ||||||
| aspect = ["aspectlib"] |  | ||||||
| elasticsearch = ["elasticsearch"] |  | ||||||
| histogram = ["pygal", "pygaljs", "setuptools"] |  | ||||||
| 
 |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "python-dateutil" | name = "python-dateutil" | ||||||
| version = "2.9.0.post0" | version = "2.9.0.post0" | ||||||
| @ -1291,6 +1243,28 @@ files = [ | |||||||
|     {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, |     {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "smart-open" | ||||||
|  | version = "5.1.0" | ||||||
|  | description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" | ||||||
|  | optional = false | ||||||
|  | python-versions = ">=3.6.*" | ||||||
|  | groups = ["main"] | ||||||
|  | markers = "python_version >= \"3.12\"" | ||||||
|  | files = [ | ||||||
|  |     {file = "smart_open-5.1.0-py3-none-any.whl", hash = "sha256:2059b07f530c8c9e2158e4e1575309aacb74bd813da2325c1f348015d04f3bd6"}, | ||||||
|  |     {file = "smart_open-5.1.0.tar.gz", hash = "sha256:e4dc1350b240ef0759e343e4e2f361bfd4e5477bb2619866e97f80240652e92e"}, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | [package.extras] | ||||||
|  | all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "requests"] | ||||||
|  | azure = ["azure-common", "azure-core", "azure-storage-blob"] | ||||||
|  | gcs = ["google-cloud-storage"] | ||||||
|  | http = ["requests"] | ||||||
|  | s3 = ["boto3"] | ||||||
|  | test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "moto[server] (==1.3.14)", "parameterizedtestcase", "paramiko", "pathlib2", "pytest", "pytest-rerunfailures", "requests", "responses"] | ||||||
|  | webhdfs = ["requests"] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "smart-open" | name = "smart-open" | ||||||
| version = "7.1.0" | version = "7.1.0" | ||||||
| @ -1298,6 +1272,7 @@ description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storag | |||||||
| optional = false | optional = false | ||||||
| python-versions = "<4.0,>=3.7" | python-versions = "<4.0,>=3.7" | ||||||
| groups = ["main"] | groups = ["main"] | ||||||
|  | markers = "python_version <= \"3.11\"" | ||||||
| files = [ | files = [ | ||||||
|     {file = "smart_open-7.1.0-py3-none-any.whl", hash = "sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"}, |     {file = "smart_open-7.1.0-py3-none-any.whl", hash = "sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"}, | ||||||
|     {file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"}, |     {file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"}, | ||||||
| @ -1747,5 +1722,5 @@ files = [ | |||||||
| 
 | 
 | ||||||
| [metadata] | [metadata] | ||||||
| lock-version = "2.1" | lock-version = "2.1" | ||||||
| python-versions = "^3.10" | python-versions = ">=3.10" | ||||||
| content-hash = "2a469cf6cd729d58a4315152a037a242fdc09dba63fe3adfe00bbb88c3f16863" | content-hash = "3315c4aedc40f50a78569149ca711d514b9a06b30c9c3b5a6f1402e5abf7e032" | ||||||
|  | |||||||
| @ -1,32 +1,32 @@ | |||||||
| [tool.poetry] | [project] | ||||||
| name = "d3lta" | name = "d3lta" | ||||||
| version = "1.0.1" | version = "1.0.2" | ||||||
| description = "A library for detecting verbatim-duplicated contents within a vast amount of documents" | description = "A library for detecting verbatim-duplicated contents within a vast amount of documents" | ||||||
| readme = "README.md" | readme = "README.md" | ||||||
| authors = ["Viginum"] | authors = [{ name = "VIGINUM" }] | ||||||
|  | license = { file = "LICENSE.txt" } | ||||||
|  | requires-python = ">=3.10" | ||||||
|  | dependencies = [ | ||||||
|  |     "faiss-cpu==1.9.0.post1", | ||||||
|  |     "fasttext==0.9.3", | ||||||
|  |     "gensim==4.3.3", | ||||||
|  |     "networkx==2.8.8", | ||||||
|  |     "pandas==2.2.3", | ||||||
|  |     "polyleven==0.8", | ||||||
|  |     "scipy==1.12.0", | ||||||
|  |     "tensorflow==2.18.0", | ||||||
|  |     "tensorflow-hub==0.16.1", | ||||||
|  |     "tensorflow-text==2.18.1", | ||||||
|  |     "tqdm==4.67.1", | ||||||
|  | ] | ||||||
| 
 | 
 | ||||||
| [tool.poetry.dependencies] |  | ||||||
| python = "^3.10" |  | ||||||
| demoji = "^1.1.0" |  | ||||||
| faiss-cpu = "1.9.0.post1" |  | ||||||
| fasttext = "0.9.3" |  | ||||||
| gensim = "4.3.3" |  | ||||||
| networkx = "2.8.8" |  | ||||||
| pandas = "2.2.3" |  | ||||||
| polyleven = "0.8" |  | ||||||
| scipy = "1.12.0" |  | ||||||
| tensorflow = "2.18.0" |  | ||||||
| tensorflow-hub = "0.16.1" |  | ||||||
| tensorflow-text = "2.18.1" |  | ||||||
| tqdm = "4.67.1" |  | ||||||
| 
 | 
 | ||||||
| [tool.poetry.group.dev] | [tool.poetry.group.dev] | ||||||
| optional = true | optional = true | ||||||
| 
 | 
 | ||||||
| [tool.poetry.group.dev.dependencies] | [tool.poetry.group.dev.dependencies] | ||||||
| pytest = "^8.3.5" | pytest = "^8.3.5" | ||||||
| pytest-benchmark = "^5.1.0" |  | ||||||
| 
 | 
 | ||||||
| [build-system] | [build-system] | ||||||
| requires = ["setuptools", "poetry-core"] | requires = ["poetry-core"] | ||||||
| build-backend = "poetry.core.masonry.api" | build-backend = "poetry.core.masonry.api" | ||||||
|  | |||||||
							
								
								
									
										4
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								setup.py
									
									
									
									
									
								
							| @ -1,4 +0,0 @@ | |||||||
| #!/usr/bin/env python |  | ||||||
| from setuptools import setup |  | ||||||
| 
 |  | ||||||
| setup() |  | ||||||
| @ -6,9 +6,6 @@ from get_unicode_emojis_list import ( | |||||||
|     EMOJI_TESTFILE_FILENAME, |     EMOJI_TESTFILE_FILENAME, | ||||||
|     get_all_emojis_from_latest_unicode_emojis_specification_with_download, |     get_all_emojis_from_latest_unicode_emojis_specification_with_download, | ||||||
| ) | ) | ||||||
| from pytest_benchmark.fixture import ( |  | ||||||
|     BenchmarkFixture, |  | ||||||
| ) |  | ||||||
| 
 | 
 | ||||||
| import d3lta.emojis_remover | import d3lta.emojis_remover | ||||||
| 
 | 
 | ||||||
| @ -17,13 +14,6 @@ import d3lta.emojis_remover | |||||||
|     name="emojis_remover", |     name="emojis_remover", | ||||||
|     params=[ |     params=[ | ||||||
|         d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover, |         d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover, | ||||||
|         pytest.param( |  | ||||||
|             d3lta.emojis_remover.DemojiEmojisRemover, |  | ||||||
|             marks=pytest.mark.xfail( |  | ||||||
|                 reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification", |  | ||||||
|                 strict=True, |  | ||||||
|             ), |  | ||||||
|         ), |  | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def fixture_emojis_remover( | def fixture_emojis_remover( | ||||||
| @ -108,11 +98,8 @@ In consequence whereof, the National 🏞️  Assembly 👩🏭👨🏭  r | |||||||
| def test_on_text_sample( | def test_on_text_sample( | ||||||
|     emojis_remover: d3lta.emojis_remover.EmojisRemover, |     emojis_remover: d3lta.emojis_remover.EmojisRemover, | ||||||
|     sample_text_with_emojipasta: str, |     sample_text_with_emojipasta: str, | ||||||
|     sample_text: str, |     sample_text: str | ||||||
|     benchmark: BenchmarkFixture, |  | ||||||
| ): | ): | ||||||
|     processed = benchmark( |     assert emojis_remover.remove_symbols( | ||||||
|         emojis_remover.remove_symbols, |  | ||||||
|         sample_text_with_emojipasta, |         sample_text_with_emojipasta, | ||||||
|     ) |     ) == sample_text | ||||||
|     assert processed == sample_text |  | ||||||
|  | |||||||
		Загрузка…
	
	
			
			x
			
			
		
	
		Ссылка в новой задаче
	
	Block a user