зеркало из
				https://github.com/VIGINUM-FR/D3lta.git
				synced 2025-10-31 05:56:20 +02:00 
			
		
		
		
	Сравнить коммиты
	
		
			26 Коммитов
		
	
	
		
	
	| Автор | SHA1 | Дата | |
|---|---|---|---|
|   | 8df5772178 | ||
|   | a18992748e | ||
|   | 045bd4becf | ||
|   | 3cdea198bb | ||
|   | 1bd593cf43 | ||
|   | b41ab2ce19 | ||
|   | 1151e21254 | ||
|   | b8fada79c2 | ||
|   | 95a07bd5a3 | ||
|   | e0c747f43c | ||
|   | 588f20cd4a | ||
|   | a92770562b | ||
|   | 991ed8141b | ||
|   | f5f71cca37 | ||
|   | b1d2b93c24 | ||
|   | 71a76b0d3a | ||
|   | ed3f0b9db3 | ||
|   | 8999d23448 | ||
|   | 427a873568 | ||
|   | 0386589b46 | ||
|   | c589aebc41 | ||
|   | 56a1f07c1e | ||
|   | fb7531405c | ||
|   | 80f12d6ee9 | ||
|   | eb1599ee10 | ||
|   | c7107aae1d | 
| @ -17,7 +17,7 @@ RUN pip install --no-cache-dir --upgrade pip | ||||
| RUN pipx install poetry==${POETRY_VERSION} | ||||
| 
 | ||||
| WORKDIR /app | ||||
| COPY pyproject.toml poetry.lock setup.py README.md ./ | ||||
| COPY pyproject.toml poetry.lock README.md LICENSE.txt ./ | ||||
| # pre-install dependencies | ||||
| RUN --mount=type=cache,target=/root/.cache poetry install --no-root | ||||
| 
 | ||||
| @ -28,3 +28,7 @@ RUN --mount=type=cache,target=/root/.cache poetry install | ||||
| FROM d3lta-prod AS d3lta-dev | ||||
| 
 | ||||
| RUN --mount=type=cache,target=/root/.cache poetry install --with dev | ||||
| 
 | ||||
| # install nektos/act as specified in https://nektosact.com/installation/index.html#bash-script | ||||
| # the -b flag specifies the target directory (cf. https://github.com/nektos/act/blob/61396d8085a9d812cebf94fa954f5938d48bf2b9/install.sh#L13) | ||||
| RUN curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -b /usr/bin | ||||
|  | ||||
| @ -4,6 +4,14 @@ | ||||
| 	"name": "Python 3", | ||||
| 	"dockerFile": "./Dockerfile", | ||||
| 	"context": "..", | ||||
| 	// Features to add to the dev container. More info: https://containers.dev/features. | ||||
| 	// "features": {} | ||||
| 	"customizations": { | ||||
| 		"vscode": { | ||||
| 			"extensions": [ | ||||
| 				"github.vscode-github-actions" | ||||
| 			] | ||||
| 		} | ||||
| 	}, | ||||
| 	"features": { | ||||
| 		"ghcr.io/devcontainers/features/docker-in-docker:2": {} | ||||
| 	} | ||||
| } | ||||
							
								
								
									
										156
									
								
								.github/workflows/publish-to-pypi.yml
									
									
									
									
										поставляемый
									
									
								
							
							
						
						
									
										156
									
								
								.github/workflows/publish-to-pypi.yml
									
									
									
									
										поставляемый
									
									
								
							| @ -1,63 +1,69 @@ | ||||
| # derived from https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow | ||||
| name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI | ||||
| name: Publish Python distribution to PyPI | ||||
| 
 | ||||
| on: push | ||||
| on: | ||||
|   release: | ||||
|     types: [published] | ||||
| 
 | ||||
| env: | ||||
|   ACT: false  # env.ACT == true when running inside nektos/act | ||||
| 
 | ||||
| jobs: | ||||
|   unit-test: | ||||
|     name: Run unit tests 🤾 | ||||
|     runs-on: ubuntu-latest | ||||
| 
 | ||||
|     steps: | ||||
|     - uses: actions/checkout@v4 | ||||
|       with: | ||||
|         persist-credentials: false | ||||
|     - name: Set up Python | ||||
|       uses: actions/setup-python@v5 | ||||
|       with: | ||||
|         python-version: "3.11" | ||||
|     - name: Install dependencies | ||||
|       run: | | ||||
|         pip install poetry | ||||
|         poetry install --with dev  | ||||
|     - name: Build a binary wheel and a source tarball | ||||
|       run:  | ||||
|         poetry run pytest | ||||
| 
 | ||||
|   build: | ||||
|     name: Build distribution 📦 | ||||
|     needs: | ||||
|     - unit-test | ||||
|     name: Build distribution | ||||
|     # based on https://stackoverflow.com/a/74318141 | ||||
|     if: ${{ github.event.release.target_commitish == 'main'}} | ||||
|     runs-on: ubuntu-latest | ||||
| 
 | ||||
|     steps: | ||||
|     - uses: actions/checkout@v4 | ||||
|       with: | ||||
|         persist-credentials: false | ||||
| 
 | ||||
|     - name: Set up Python | ||||
|       uses: actions/setup-python@v5 | ||||
|       with: | ||||
|         python-version: "3.x" | ||||
|     - name: Install pypa/build | ||||
|       run: >- | ||||
|         python3 -m | ||||
|         pip install | ||||
|         build | ||||
|         --user | ||||
| 
 | ||||
|     - name: Build a binary wheel and a source tarball | ||||
|       run: python3 -m build | ||||
|       run: pipx run build | ||||
| 
 | ||||
|     - name: Store the distribution packages | ||||
|       uses: actions/upload-artifact@v4 | ||||
|       with: | ||||
|         name: python-package-distributions | ||||
|         name: distfiles | ||||
|         path: dist/ | ||||
|         if-no-files-found: error | ||||
| 
 | ||||
|   # taken from https://github.com/python-poetry/poetry/blob/b580e8aa4fbce53569420e7b42568dfd9e73519f/.github/workflows/release.yaml | ||||
|   upload-built-distribution-to-github-release: | ||||
|     name: Upload (GitHub) | ||||
|     runs-on: ubuntu-latest | ||||
|     permissions: | ||||
|       contents: write | ||||
|     needs: build | ||||
|     steps: | ||||
|       # Checking-out the project since the gh CLI expects to be called in the context of a git repository. | ||||
|       - uses: actions/checkout@v4 | ||||
|         with: | ||||
|           persist-credentials: false | ||||
| 
 | ||||
|       - name: Retrieve built distribution | ||||
|         uses: actions/download-artifact@v4 | ||||
|         with: | ||||
|           name: distfiles | ||||
|           path: dist/ | ||||
| 
 | ||||
|       - run: gh release upload "${TAG_NAME}" dist/*.{tar.gz,whl} | ||||
|         # skip step when debugging locally via nektos/act | ||||
|         if: ${{ !env.ACT }} | ||||
|         env: | ||||
|           GH_TOKEN: ${{ github.token }} | ||||
|           TAG_NAME: ${{ github.event.release.tag_name }} | ||||
| 
 | ||||
|   publish-to-pypi: | ||||
|     name: >- | ||||
|       Publish Python 🐍 distribution 📦 to PyPI | ||||
|     if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes | ||||
|     needs: | ||||
|     - build | ||||
|     name: Publish Python distribution to PyPI | ||||
|     needs: build | ||||
|     runs-on: ubuntu-latest | ||||
|     environment: | ||||
|       name: pypi | ||||
| @ -66,77 +72,15 @@ jobs: | ||||
|       id-token: write  # IMPORTANT: mandatory for trusted publishing | ||||
| 
 | ||||
|     steps: | ||||
|     - name: Download all the dists | ||||
|       - name: Retrieve built distribution | ||||
|         uses: actions/download-artifact@v4 | ||||
|         with: | ||||
|         name: python-package-distributions | ||||
|           name: distfiles | ||||
|           path: dist/ | ||||
|     - name: Publish distribution 📦 to PyPI | ||||
|       uses: pypa/gh-action-pypi-publish@release/v1 | ||||
| 
 | ||||
|   github-release: | ||||
|     name: >- | ||||
|       Sign the Python 🐍 distribution 📦 with Sigstore | ||||
|       and upload them to GitHub Release | ||||
|     needs: | ||||
|     - publish-to-pypi | ||||
|     runs-on: ubuntu-latest | ||||
| 
 | ||||
|     permissions: | ||||
|       contents: write  # IMPORTANT: mandatory for making GitHub Releases | ||||
|       id-token: write  # IMPORTANT: mandatory for sigstore | ||||
| 
 | ||||
|     steps: | ||||
|     - name: Download all the dists | ||||
|       uses: actions/download-artifact@v4 | ||||
|       with: | ||||
|         name: python-package-distributions | ||||
|         path: dist/ | ||||
|     - name: Sign the dists with Sigstore | ||||
|       uses: sigstore/gh-action-sigstore-python@v3.0.0 | ||||
|       with: | ||||
|         inputs: >- | ||||
|           ./dist/*.tar.gz | ||||
|           ./dist/*.whl | ||||
|     - name: Create GitHub Release | ||||
|       env: | ||||
|         GITHUB_TOKEN: ${{ github.token }} | ||||
|       run: >- | ||||
|         gh release create | ||||
|         "$GITHUB_REF_NAME" | ||||
|         --repo "$GITHUB_REPOSITORY" | ||||
|         --notes "" | ||||
|     - name: Upload artifact signatures to GitHub Release | ||||
|       env: | ||||
|         GITHUB_TOKEN: ${{ github.token }} | ||||
|       # Upload to GitHub Release using the `gh` CLI. | ||||
|       # `dist/` contains the built packages, and the | ||||
|       # sigstore-produced signatures and certificates. | ||||
|       run: >- | ||||
|         gh release upload | ||||
|         "$GITHUB_REF_NAME" dist/** | ||||
|         --repo "$GITHUB_REPOSITORY" | ||||
| 
 | ||||
|   publish-to-testpypi: | ||||
|     name: Publish Python 🐍 distribution 📦 to TestPyPI | ||||
|     needs: | ||||
|     - build | ||||
|     runs-on: ubuntu-latest | ||||
| 
 | ||||
|     environment: | ||||
|       name: testpypi | ||||
|       url: https://test.pypi.org/p/d3lta  # pypi is case insensitive so d3lta == D3lta | ||||
| 
 | ||||
|     permissions: | ||||
|       id-token: write  # IMPORTANT: mandatory for trusted publishing | ||||
| 
 | ||||
|     steps: | ||||
|     - name: Download all the dists | ||||
|       uses: actions/download-artifact@v4 | ||||
|       with: | ||||
|         name: python-package-distributions | ||||
|         path: dist/ | ||||
|     - name: Publish distribution 📦 to TestPyPI | ||||
|       - name: Publish distribution to PyPI | ||||
|         # skip step when debugging locally via nektos/act | ||||
|         if: ${{ !env.ACT }} | ||||
|         uses: pypa/gh-action-pypi-publish@release/v1 | ||||
|         with: | ||||
|         repository-url: https://test.pypi.org/legacy/ | ||||
|           print-hash: true | ||||
|  | ||||
							
								
								
									
										28
									
								
								.github/workflows/test.yml
									
									
									
									
										поставляемый
									
									
										Обычный файл
									
								
							
							
						
						
									
										28
									
								
								.github/workflows/test.yml
									
									
									
									
										поставляемый
									
									
										Обычный файл
									
								
							| @ -0,0 +1,28 @@ | ||||
| name: Run tests | ||||
| 
 | ||||
| on: push | ||||
| 
 | ||||
| jobs: | ||||
|   unit-test: | ||||
|     name: Run unit tests | ||||
|     runs-on: ubuntu-latest | ||||
| 
 | ||||
|     steps: | ||||
|     - uses: actions/checkout@v4 | ||||
|       with: | ||||
|         persist-credentials: false | ||||
| 
 | ||||
|     - name: Install poetry | ||||
|       run: pipx install poetry | ||||
| 
 | ||||
|     - name: Set up Python | ||||
|       uses: actions/setup-python@v5 | ||||
|       with: | ||||
|         python-version: "3.11" | ||||
|         cache: poetry | ||||
| 
 | ||||
|     - name: Install dependencies | ||||
|       run: poetry install --with dev | ||||
| 
 | ||||
|     - name: Run tests | ||||
|       run: poetry run pytest | ||||
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										поставляемый
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										поставляемый
									
									
								
							| @ -229,3 +229,4 @@ pyrightconfig.json | ||||
| use_model_kaggle/ | ||||
| include/ | ||||
| .benchmarks | ||||
| .act-event.json | ||||
							
								
								
									
										60
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										60
									
								
								README.md
									
									
									
									
									
								
							| @ -1,43 +1,22 @@ | ||||
| <h2 align="center"> <a href="https://arxiv.org/abs/2312.17338">D3lta</a></h2> | ||||
| 
 | ||||
| <h5 align="center">  | ||||
| 
 | ||||
| If you like our project, please give us a star ⭐ on GitHub for the latest update.  </h2> | ||||
| 
 | ||||
| </h5> | ||||
| 
 | ||||
| <div align=center> | ||||
| # D3lta | ||||
| 
 | ||||
| [](https://pypi.org/project/d3lta/) | ||||
| [](https://arxiv.org/abs/2312.17338) | ||||
| 
 | ||||
| This repository is the official implementation of D3lta, a library for detecting duplicate verbatim contents within a vast amount of documents. | ||||
| 
 | ||||
| It distinguishes 3 types of duplicate contents : copypasta (almost exact duplicates), rewording and translation. You can run it on CPU. | ||||
| </div> | ||||
| 
 | ||||
| --- | ||||
| 
 | ||||
| <img style="display: block; margin: auto;" src="https://github.com/VIGINUM-FR/D3lta/raw/main/static/graph.gif"/> | ||||
| 
 | ||||
| 
 | ||||
| ## 💻 Installing  | ||||
| 
 | ||||
| Clone the repository | ||||
| ## 💻 Installation | ||||
| 
 | ||||
| ```bash | ||||
| git clone https://github.com/VIGINUM-FR/D3lta | ||||
| ``` | ||||
| 
 | ||||
| Navigate to the project | ||||
| 
 | ||||
| ```bash | ||||
| cd D3lta | ||||
| ``` | ||||
| 
 | ||||
| Install the package | ||||
| 
 | ||||
| ```bash | ||||
| pip install -e . | ||||
| # PyPI is case insensitive, so d3lta == D3lta | ||||
| pip install d3lta | ||||
| ``` | ||||
| 
 | ||||
| ## 🚀 Quick start | ||||
| @ -163,11 +142,10 @@ matches, df_clusters = semantic_faiss( | ||||
| matches | ||||
| ``` | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| ## 📚 Synthetic dataset | ||||
| 
 | ||||
| The dataset is available in the release `1.0.0`. It contains the following files: | ||||
| The dataset is available in the [`1.0.0` release](https://github.com/VIGINUM-FR/D3lta/releases/tag/1.0.0). | ||||
| It contains the following files: | ||||
| 
 | ||||
| ### `synthetic_dataset_documents.csv`: | ||||
| 
 | ||||
| @ -205,10 +183,30 @@ Column details: | ||||
| 
 | ||||
| ## Notebooks | ||||
| 
 | ||||
| In folder the [`notebooks`](./notebooks/), you can find:  | ||||
| - [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): Example of applying threedelta methodology to the synthetic dataset, with a comparison to the true labels. | ||||
| In the [`notebooks`](./notebooks/) directory, you can find:  | ||||
| - [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): example of applying the D3lta methodology to the synthetic dataset, with a comparison to the true labels. | ||||
| 
 | ||||
| 
 | ||||
| ## 👩💻 Developing | ||||
| 
 | ||||
| Clone the repository | ||||
| 
 | ||||
| ```bash | ||||
| git clone https://github.com/VIGINUM-FR/D3lta | ||||
| ``` | ||||
| 
 | ||||
| Navigate to the project | ||||
| 
 | ||||
| ```bash | ||||
| cd D3lta | ||||
| ``` | ||||
| 
 | ||||
| Install the package | ||||
| 
 | ||||
| ```bash | ||||
| pip install -e . | ||||
| ``` | ||||
| 
 | ||||
| ## Citation | ||||
| 
 | ||||
| If you find our paper and code useful in your research, please consider giving a star 🌟  and a citation 📝: | ||||
|  | ||||
| @ -3,8 +3,6 @@ from abc import ABC, abstractmethod | ||||
| from dataclasses import dataclass | ||||
| from typing import final | ||||
| 
 | ||||
| import demoji | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
| class EmojisRemover(ABC): | ||||
| @ -96,8 +94,3 @@ class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover): | ||||
| 
 | ||||
|     def _remove_symbols_implementation(self, text: str) -> str: | ||||
|         return self.SYMBOLS_REGEX.sub(r"", text) | ||||
| 
 | ||||
| 
 | ||||
| class DemojiEmojisRemover(EmojisRemover): | ||||
|     def _remove_symbols_implementation(self, text: str) -> str: | ||||
|         return demoji.replace(text) | ||||
|  | ||||
							
								
								
									
										75
									
								
								poetry.lock
									
									
									
										сгенерированный
									
									
									
								
							
							
						
						
									
										75
									
								
								poetry.lock
									
									
									
										сгенерированный
									
									
									
								
							| @ -155,21 +155,6 @@ files = [ | ||||
| ] | ||||
| markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} | ||||
| 
 | ||||
| [[package]] | ||||
| name = "demoji" | ||||
| version = "1.1.0" | ||||
| description = "Accurately remove and replace emojis in text strings" | ||||
| optional = false | ||||
| python-versions = ">=3.6" | ||||
| groups = ["main"] | ||||
| files = [ | ||||
|     {file = "demoji-1.1.0-py3-none-any.whl", hash = "sha256:6d3256c909aea299e97fe984f827a2a060c2a8f8bfcbafa7ec9659967c5df50f"}, | ||||
|     {file = "demoji-1.1.0.tar.gz", hash = "sha256:072efaeca725e6f63ab59d83abeb55b178842538ed9256455a82ebbd055ff216"}, | ||||
| ] | ||||
| 
 | ||||
| [package.extras] | ||||
| ujson = ["ujson"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "exceptiongroup" | ||||
| version = "1.3.0" | ||||
| @ -1060,18 +1045,6 @@ files = [ | ||||
|     {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "py-cpuinfo" | ||||
| version = "9.0.0" | ||||
| description = "Get CPU info with pure Python" | ||||
| optional = false | ||||
| python-versions = "*" | ||||
| groups = ["dev"] | ||||
| files = [ | ||||
|     {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, | ||||
|     {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "pybind11" | ||||
| version = "2.13.6" | ||||
| @ -1125,27 +1098,6 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""} | ||||
| [package.extras] | ||||
| dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "pytest-benchmark" | ||||
| version = "5.1.0" | ||||
| description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer." | ||||
| optional = false | ||||
| python-versions = ">=3.9" | ||||
| groups = ["dev"] | ||||
| files = [ | ||||
|     {file = "pytest-benchmark-5.1.0.tar.gz", hash = "sha256:9ea661cdc292e8231f7cd4c10b0319e56a2118e2c09d9f50e1b3d150d2aca105"}, | ||||
|     {file = "pytest_benchmark-5.1.0-py3-none-any.whl", hash = "sha256:922de2dfa3033c227c96da942d1878191afa135a29485fb942e85dff1c592c89"}, | ||||
| ] | ||||
| 
 | ||||
| [package.dependencies] | ||||
| py-cpuinfo = "*" | ||||
| pytest = ">=8.1" | ||||
| 
 | ||||
| [package.extras] | ||||
| aspect = ["aspectlib"] | ||||
| elasticsearch = ["elasticsearch"] | ||||
| histogram = ["pygal", "pygaljs", "setuptools"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "python-dateutil" | ||||
| version = "2.9.0.post0" | ||||
| @ -1291,6 +1243,28 @@ files = [ | ||||
|     {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "smart-open" | ||||
| version = "5.1.0" | ||||
| description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" | ||||
| optional = false | ||||
| python-versions = ">=3.6.*" | ||||
| groups = ["main"] | ||||
| markers = "python_version >= \"3.12\"" | ||||
| files = [ | ||||
|     {file = "smart_open-5.1.0-py3-none-any.whl", hash = "sha256:2059b07f530c8c9e2158e4e1575309aacb74bd813da2325c1f348015d04f3bd6"}, | ||||
|     {file = "smart_open-5.1.0.tar.gz", hash = "sha256:e4dc1350b240ef0759e343e4e2f361bfd4e5477bb2619866e97f80240652e92e"}, | ||||
| ] | ||||
| 
 | ||||
| [package.extras] | ||||
| all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "requests"] | ||||
| azure = ["azure-common", "azure-core", "azure-storage-blob"] | ||||
| gcs = ["google-cloud-storage"] | ||||
| http = ["requests"] | ||||
| s3 = ["boto3"] | ||||
| test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage", "moto[server] (==1.3.14)", "parameterizedtestcase", "paramiko", "pathlib2", "pytest", "pytest-rerunfailures", "requests", "responses"] | ||||
| webhdfs = ["requests"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "smart-open" | ||||
| version = "7.1.0" | ||||
| @ -1298,6 +1272,7 @@ description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storag | ||||
| optional = false | ||||
| python-versions = "<4.0,>=3.7" | ||||
| groups = ["main"] | ||||
| markers = "python_version <= \"3.11\"" | ||||
| files = [ | ||||
|     {file = "smart_open-7.1.0-py3-none-any.whl", hash = "sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"}, | ||||
|     {file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"}, | ||||
| @ -1747,5 +1722,5 @@ files = [ | ||||
| 
 | ||||
| [metadata] | ||||
| lock-version = "2.1" | ||||
| python-versions = "^3.10" | ||||
| content-hash = "2a469cf6cd729d58a4315152a037a242fdc09dba63fe3adfe00bbb88c3f16863" | ||||
| python-versions = ">=3.10" | ||||
| content-hash = "3315c4aedc40f50a78569149ca711d514b9a06b30c9c3b5a6f1402e5abf7e032" | ||||
|  | ||||
| @ -1,32 +1,32 @@ | ||||
| [tool.poetry] | ||||
| [project] | ||||
| name = "d3lta" | ||||
| version = "1.0.1" | ||||
| version = "1.0.2" | ||||
| description = "A library for detecting verbatim-duplicated contents within a vast amount of documents" | ||||
| readme = "README.md" | ||||
| authors = ["Viginum"] | ||||
| authors = [{ name = "VIGINUM" }] | ||||
| license = { file = "LICENSE.txt" } | ||||
| requires-python = ">=3.10" | ||||
| dependencies = [ | ||||
|     "faiss-cpu==1.9.0.post1", | ||||
|     "fasttext==0.9.3", | ||||
|     "gensim==4.3.3", | ||||
|     "networkx==2.8.8", | ||||
|     "pandas==2.2.3", | ||||
|     "polyleven==0.8", | ||||
|     "scipy==1.12.0", | ||||
|     "tensorflow==2.18.0", | ||||
|     "tensorflow-hub==0.16.1", | ||||
|     "tensorflow-text==2.18.1", | ||||
|     "tqdm==4.67.1", | ||||
| ] | ||||
| 
 | ||||
| [tool.poetry.dependencies] | ||||
| python = "^3.10" | ||||
| demoji = "^1.1.0" | ||||
| faiss-cpu = "1.9.0.post1" | ||||
| fasttext = "0.9.3" | ||||
| gensim = "4.3.3" | ||||
| networkx = "2.8.8" | ||||
| pandas = "2.2.3" | ||||
| polyleven = "0.8" | ||||
| scipy = "1.12.0" | ||||
| tensorflow = "2.18.0" | ||||
| tensorflow-hub = "0.16.1" | ||||
| tensorflow-text = "2.18.1" | ||||
| tqdm = "4.67.1" | ||||
| 
 | ||||
| [tool.poetry.group.dev] | ||||
| optional = true | ||||
| 
 | ||||
| [tool.poetry.group.dev.dependencies] | ||||
| pytest = "^8.3.5" | ||||
| pytest-benchmark = "^5.1.0" | ||||
| 
 | ||||
| [build-system] | ||||
| requires = ["setuptools", "poetry-core"] | ||||
| requires = ["poetry-core"] | ||||
| build-backend = "poetry.core.masonry.api" | ||||
|  | ||||
							
								
								
									
										4
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								setup.py
									
									
									
									
									
								
							| @ -1,4 +0,0 @@ | ||||
| #!/usr/bin/env python | ||||
| from setuptools import setup | ||||
| 
 | ||||
| setup() | ||||
| @ -6,9 +6,6 @@ from get_unicode_emojis_list import ( | ||||
|     EMOJI_TESTFILE_FILENAME, | ||||
|     get_all_emojis_from_latest_unicode_emojis_specification_with_download, | ||||
| ) | ||||
| from pytest_benchmark.fixture import ( | ||||
|     BenchmarkFixture, | ||||
| ) | ||||
| 
 | ||||
| import d3lta.emojis_remover | ||||
| 
 | ||||
| @ -17,13 +14,6 @@ import d3lta.emojis_remover | ||||
|     name="emojis_remover", | ||||
|     params=[ | ||||
|         d3lta.emojis_remover.ExplicitUnicodeBlocksEmojisRemover, | ||||
|         pytest.param( | ||||
|             d3lta.emojis_remover.DemojiEmojisRemover, | ||||
|             marks=pytest.mark.xfail( | ||||
|                 reason="`demoji`'s detection engine does not detect all emojis in the Unicode specification", | ||||
|                 strict=True, | ||||
|             ), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def fixture_emojis_remover( | ||||
| @ -108,11 +98,8 @@ In consequence whereof, the National 🏞️  Assembly 👩🏭👨🏭  r | ||||
| def test_on_text_sample( | ||||
|     emojis_remover: d3lta.emojis_remover.EmojisRemover, | ||||
|     sample_text_with_emojipasta: str, | ||||
|     sample_text: str, | ||||
|     benchmark: BenchmarkFixture, | ||||
|     sample_text: str | ||||
| ): | ||||
|     processed = benchmark( | ||||
|         emojis_remover.remove_symbols, | ||||
|     assert emojis_remover.remove_symbols( | ||||
|         sample_text_with_emojipasta, | ||||
|     ) | ||||
|     assert processed == sample_text | ||||
|     ) == sample_text | ||||
|  | ||||
		Загрузка…
	
	
			
			x
			
			
		
	
		Ссылка в новой задаче
	
	Block a user