This commit is contained in:
Someone 2024-12-19 10:58:10 +01:00
Commit 858072c471
10 changed files: 112413 additions and 0 deletions

230
.gitignore vendored Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,230 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,git,linux
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,git,linux
### Git ###
# Created by git for backups. To disable backups in Git:
# $ git config --global mergetool.keepBackup false
*.orig
# Created by git when using merge tools for conflicts
*.BACKUP.*
*.BASE.*
*.LOCAL.*
*.REMOTE.*
*_BACKUP_*.txt
*_BASE_*.txt
*_LOCAL_*.txt
*_REMOTE_*.txt
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,git,linux
*.pkl
*.ftz
use_model_kaggle/
include/

21
LICENSE.txt Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 VIGINUM
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

226
README.md Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,226 @@
<h2 align="center"> <a href="https://arxiv.org/abs/2312.17338">D3lta</a></h2>
<h5 align="center">
If you like our project, please give us a star ⭐ on GitHub for the latest updates.
</h5>
<div align=center>
[![arXiv](https://img.shields.io/badge/Arxiv-2312.17338-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2312.17338)
This repository is the official implementation of D3lta, a library for detecting duplicate verbatim content within a vast amount of documents.
It distinguishes 3 types of duplicated content: copypasta (almost exact duplicates), rewording and translation. You can run it on CPU.
</div>
---
<img style="display: block; margin: auto;" src="./static/graph.gif"/>
## πŸ’» Installing
Clone the repository
```bash
git clone https://github.com/VIGINUM-FR/D3lta
```
Navigate to the project
```bash
cd D3lta
```
Install the package
```bash
pip install -e .
```
## πŸš€ Quick start
You can apply the `semantic_faiss` function directly to a DataFrame containing texts.
By default, the embeddings come from the [Universal Sentence Encoder](https://www.kaggle.com/models/google/universal-sentence-encoder/tensorFlow1/lite/2),
but other models can be used to compute the embeddings (see the last example below).
```python
import pandas as pd
from d3lta.faissd3lta import *
examples_dataset = [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df.index = df.index.astype(str)
matches, df_clusters = semantic_faiss(
df=df.rename(columns={"text_language_detect": "original"}),
min_size_txt=10,
embeddings_to_save='myembeddings',
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
)
>>>matches
source target score duplicates language_source text_to_embed_source text_grapheme_source language_target text_to_embed_target text_grapheme_target dup_type score_lev
0 2 3 0.745741 2-3 fr Les chats sont gris leschatssontgris en Cat's are grey, aren't they ? catsaregreyarentthey translation NaN
1 2 4 0.955517 2-4 fr Les chats sont gris leschatssontgris en Cats are grey catsaregrey translation NaN
2 2 5 0.808805 2-5 fr Les chats sont gris leschatssontgris fr Les chats ne sont pas gris leschatsnesontpasgris copy-pasta 0.761905
5 3 5 0.833525 3-5 en Cat's are grey, aren't they ? catsaregreyarentthey fr Les chats ne sont pas gris leschatsnesontpasgris translation NaN
8 4 5 0.767601 4-5 en Cats are grey catsaregrey fr Les chats ne sont pas gris leschatsnesontpasgris translation NaN
>>>df_clusters
original language text_grapheme text_to_embed text_language_detect cluster
0 Je m'apelle Mimie et je fais du stop fr jemapellemimieetjefaisdustop Je m'apelle Mimie et je fais du stop Je m'apelle Mimie et je fais du stop NaN
1 Je m'apelle Giselle et toi ? fr jemapellegiselleettoi Je m'apelle Giselle et toi ? Je m'apelle Giselle et toi ? NaN
2 Les chats sont gris fr leschatssontgris Les chats sont gris Les chats sont gris 0.0
3 Cat's are grey, aren't they ? en catsaregreyarentthey Cat's are grey, aren't they ? Cat's are grey, aren't they ? 0.0
4 Cats are grey en catsaregrey Cats are grey Cats are grey 0.0
5 Les chats ne sont pas gris fr leschatsnesontpasgris Les chats ne sont pas gris Les chats ne sont pas gris 0.0
```
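Since the call above passes `embeddings_to_save='myembeddings'`, the computed embeddings are written to `myembeddings.pkl`. A minimal sketch of a later run that reuses them through `df_embeddings_use` instead of recomputing them (assuming the file from the previous run is in the working directory):
```python
import pandas as pd
from d3lta.faissd3lta import semantic_faiss

# reload the embeddings pickled by the previous run
df_emb = pd.read_pickle("myembeddings.pkl")

matches, df_clusters = semantic_faiss(
    df=df.rename(columns={"text_language_detect": "original"}),  # same `df` as above
    min_size_txt=10,
    df_embeddings_use=df_emb,  # skip the embedding computation
    threshold_grapheme=0.693,
    threshold_language=0.715,
    threshold_semantic=0.85,
)
```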
It is also possible to use [Faiss](https://github.com/facebookresearch/faiss) directly to search for similar embeddings.
```python
import pandas as pd
from d3lta.faissd3lta import *
examples_dataset = [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
df_test = pd.DataFrame(
examples_dataset,
columns=["text_to_embed"],
index=range(len(examples_dataset)),
) # index for checking that it has good ids
df_emb = compute_embeddings(df_test)
index_t = create_index_cosine(df_emb)
test_dataset = pd.DataFrame([{"text_to_embed": "I gatti sono grigi"}])
df_emb_test = compute_embeddings(test_dataset)
limits, distances, indices = index_t.range_search(
x=df_emb_test.to_numpy().reshape(1, -1), thresh=0.7
)
>>>df_test.loc[indices]["text_to_embed"]
2 Les chats sont gris
3 Cat's are grey, aren't they ?
4 Cats are grey
5 Les chats ne sont pas gris
Name: text_to_embed, dtype: object
```
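For larger corpora, the index built above can be persisted with faiss's standard `write_index`/`read_index` helpers, so the corpus does not have to be re-indexed on every run. A minimal sketch (the file name is arbitrary):
```python
import faiss

# save the cosine-similarity index built above
faiss.write_index(index_t, "d3lta_use.index")

# ...and reload it later for new queries
index_reloaded = faiss.read_index("d3lta_use.index")
limits, distances, indices = index_reloaded.range_search(
    x=df_emb_test.to_numpy().reshape(1, -1), thresh=0.7
)
```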
It is also possible to use your own embeddings (other than the Universal Sentence Encoder). For example:
```python
import pandas as pd
from sentence_transformers import SentenceTransformer
from d3lta.faissd3lta import *
examples_dataset = [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df.index = df.index.astype(str)
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
new_emb = model.encode(df['text_language_detect'].values.tolist())
df_emb = pd.DataFrame(new_emb, index=df.index)
matches, df_clusters = semantic_faiss(
df=df.rename(columns={"text_language_detect": "original"}),
min_size_txt=10,
df_embeddings_use=df_emb,
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
)
matches
```
## πŸ“š Synthetic dataset
The dataset is available in the release `1.0.0`. It contains the following files:
### `synthetic_dataset_documents.csv`:
This file contains all seeds (real and original texts) and their generated variations (copy-pasta, rewording or translations).
The corpus contains 2,985 documents; the variations were generated with a large language model.
Column details:
- doc_id (int): unique number associated with each text. Seed indices are multiples of 10 and are followed by their 9 transformations.
- original (str): real or transformed text
- text_type (str): dataset where the seed was extracted (`books`, `news`, `tweets`)
- language (str): language of the text
- prompt (str): prompt given to ChatGPT for "copypasta" and "rewording"
- seed (bool): True if the text is one of the 300 initial texts from which the variations were generated
The 300 initial texts (seeds) were taken from three Kaggle datasets:
- https://www.kaggle.com/competitions/nlp-getting-started/data
- https://www.kaggle.com/datasets/abireltaief/books-reviews
- https://www.kaggle.com/datasets/rmisra/news-category-dataset
(For more info, please refer to the [paper](https://arxiv.org/abs/2312.17338))
### `synthetic_dataset_pairs_unbalanced.csv`:
This file contains the 1,497,547 annotated text pairs of the synthetic dataset: 4,500 translation pairs, 4,030 copypasta pairs, 4,017 rewording pairs and 1,485,000 pairs of non-duplicated content labelled "nomatch".
Column details:
- source_target (str): unique id for the pair.
- source (int): index of the first text of the pair in the synthetic_dataset_documents.csv
- target (int): index of the second text of the pair in the synthetic_dataset_documents.csv
- original_source (str): text of the source index
- original_target (str): text of the target index
- language_source (str): language of original_source
- language_target (str): language of original_target
- true_label (str): transformation relation that links the two texts of the pair, i.e. the source and target texts are {true_label} of each other. The true_label can be "copypasta", "rewording" or "translation". A short loading sketch is shown below.
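A minimal sketch of loading both files with pandas, assuming the two CSVs from release `1.0.0` have been downloaded into the working directory (file and column names as described above):
```python
import pandas as pd

documents = pd.read_csv("synthetic_dataset_documents.csv")
pairs = pd.read_csv("synthetic_dataset_pairs_unbalanced.csv")

print(len(documents))                      # expected: 2985 documents
print(pairs["true_label"].value_counts())  # distribution of pair labels
```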
## Notebooks
In the [`notebooks`](./notebooks/) folder, you can find:
- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): an example of applying the D3lta methodology to the synthetic dataset, with a comparison to the true labels.
## Citation
If you find our paper and code useful in your research, please consider giving a star 🌟 and a citation πŸ“:
```BibTeX
@misc{richard2023unmasking,
title={Unmasking information manipulation: A quantitative approach to detecting Copy-pasta, Rewording, and Translation on Social Media},
author={Manon Richard and Lisa Giordani and Cristian Brokate and Jean LiΓ©nard},
year={2023},
eprint={2312.17338},
archivePrefix={arXiv},
primaryClass={cs.SI},
url={https://arxiv.org/abs/2312.17338},
}
```

0
d3lta/__init__.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

575
d3lta/faissd3lta.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,575 @@
from functools import wraps
import os
import re
import time
from typing import Union
import demoji
import faiss
import fasttext
from gensim.utils import deaccent
import networkx as nx
import numpy as np
import pandas as pd
from polyleven import levenshtein
import requests
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from tqdm.contrib.concurrent import thread_map
from tqdm.auto import trange
def timeit(func):
@wraps(func)
def timeit_wrapper(*args, **kwargs):
start_time = time.time()
print(f">>> Start {func.__name__}")
result = func(*args, **kwargs)
end_time = time.time()
total_time = end_time - start_time
if total_time < 60:
print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
else:
print(f"<<< End {func.__name__}, Took:{np.round((total_time)/60, 1)} min")
return result
return timeit_wrapper
def grouper(iterable, n):
"""A (lazy) iterator that chunks `iterable` into lists of `n`"""
for i in range(0, len(iterable), n):
yield iterable[i : i + n]
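# Example: list(grouper([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]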
###############################
#### Preprocessing Dataset ####
###############################
def preprocess_text(
s,
lower=True,
remove_accents=True,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=False,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=False,
):
"""
clean a list-like of strings, performing all the following treatments by default
Args:
s (list-like of strings): input list-like of strings
lower (bool, optional): lowercase the text. Defaults to True.
remove_accents (bool, optional): deaccent the text. Defaults to True.
remove_urls (bool, optional): remove urls from the text. Defaults to True.
remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis from the text. Defaults to True.
remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False.
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
remove_punctuation (bool, optional): remove punctuation from the text; be careful, this also removes the # of hashtags. Defaults to False.
"""
if s is None:
s = ""
assert isinstance(s, (str, list, pd.Series, set, frozenset))
if isinstance(s, str):
encapsulated = True
s = [s]
else:
encapsulated = False
if lower:
s = [msg.lower() for msg in s]
if remove_accents:
s = [deaccent(msg) for msg in s]
if remove_urls:
match_url_regexp = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
s = [re.sub(match_url_regexp, "", msg, flags=re.MULTILINE).strip() for msg in s]
if remove_mentions:
match_mentions_regexp = r"(@[a-zA-Z0-9_]+)"
s = [
re.sub(match_mentions_regexp, "", msg, flags=re.MULTILINE).strip()
for msg in s
]
if remove_twitter_cropend:
match_croppedmsg_regexp = r"([^\s]+…)$"
s = [
re.sub(match_croppedmsg_regexp, "", msg, flags=re.MULTILINE).strip()
for msg in s
]
if remove_emojis:
s = [demoji.replace(msg, "").strip() for msg in s]
if remove_hashtags_frontend:
if (not remove_urls) or (not remove_mentions):
print(
"Not all leading and ending hashtags might be removed because there might be mentions or urls"
)
match_hashtags_begin = r"(#\S+ ?)+"
match_hashtags_end = r"(\S+# ?)+"
match_hashtags_frontend = f"^({match_hashtags_begin})|^({match_hashtags_end})|({match_hashtags_begin})$|({match_hashtags_end})$"
s = [re.sub(match_hashtags_frontend, "", msg).strip() for msg in s]
if replace_newline_characters:
match_escapes_regexp = r"(\n|\r)+"
s = [
re.sub(
r"\s+", " ", re.sub(match_escapes_regexp, ". ", msg, flags=re.MULTILINE)
).strip()
for msg in s
]
if remove_punctuation:
match_punctuations = r"[^\w\s]"
s = [
re.sub(r"\s+", " ", re.sub(match_punctuations, " ", msg)).strip()
for msg in s
]
if encapsulated:
return s[0].strip()
else:
return s
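# Example with the default options (lowercasing; urls, mentions and emojis removed), expected output:
# preprocess_text("Bonjour @user https://t.co/xyz 😊") -> "bonjour"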
@timeit
def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int = 30):
"""
Create new columns of preprocessed texts from original text for distance comparison with 3 delta method
Args:
dataset (Union[pd.Series, pd.DataFrame]): dataframe or series containing a column "original" with the text. Optional: a column "language" can be given, otherwise language detection is implemented.
min_size_txt (Optional[int], optional): minimum length of text_grapheme below which a document is considered too short for duplicate detection and is removed. If set to None, no text is removed. Defaults to 30.
Returns:
dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for 3 delta method.
"""
assert isinstance(
dataset, (pd.Series, pd.DataFrame)
), "dataset must be a pd.Series or a pd.DataFrame"
assert dataset.index.nunique() == len(
dataset
), "dataset must be indexed with unique indices"
assert all(
[isinstance(i, str) for i in dataset.index]
), "dataset indices must be `str`"
if isinstance(dataset, pd.DataFrame):
assert (
"original" in dataset.columns
), "when dataset is a pd.DataFrame, it must have a column named 'original'"
if isinstance(dataset, pd.Series):
dataset = dataset.to_frame("original")
# text_grapheme is used for grapheme distance (Levenshtein)
# this is the cleanest version with no spaces
if "text_grapheme" not in dataset.columns:
dataset["text_grapheme"] = [
t.replace(" ", "")
for t in preprocess_text(
dataset["original"],
lower=True,
remove_accents=True,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=True,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=True,
)
]
# text_to_embed is used for semantic distance and embedded with USE
# links are removed
if "text_to_embed" not in dataset.columns:
dataset["text_to_embed"] = preprocess_text(
dataset["original"],
lower=False,
remove_accents=False,
remove_urls=True,
remove_mentions=True,
remove_emojis=False,
remove_hashtags_frontend=False,
remove_twitter_cropend=False,
replace_newline_characters=False,
remove_punctuation=False,
)
# text_language_detect is used for fasttext
# accents and emojis are kept as they provide interesting cues to language
if ("language" not in dataset.columns) or (
"text_language_detect" not in dataset.columns
):
dataset["text_language_detect"] = preprocess_text(
dataset["original"],
lower=False,
remove_accents=False,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=True,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=False,
)
print("Done.")
print("")
if min_size_txt is not None:
print(
f'Removing {(dataset["text_grapheme"].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences...'
)
dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
print("Done.")
return dataset
@timeit
def compute_language(
dataset: pd.DataFrame,
fasttext_model=None,
batch_size: int = 100,
max_workers: int = 8,
):
"""
Compute language detection in order to detect translation
Args :
dataset (pd.DataFrame): dataframe containing the column "text_language_detect" with the text to be analyzed
fasttext_model (Optional[any], optional): optional, if another model than fasttext is to be used, otherwise, fasttext is uploaded. Defaults to None.
batch_size (int, optional): batch size of text to be retrieved each step by parallelization. Defaults to 100.
max_workers (int, optional): number of workers for parallelization. Defaults to 8.
Returns:
dataset (pd.DataFrame): The same input dataset with column 'language' added containing the results of language detection.
"""
assert (
"text_language_detect" in dataset.columns
), "you need to have a column text_language_detect to detect language"
if fasttext_model is None:
if os.path.exists("lid.176.ftz"):
print("Loading fastext model from local file...")
fasttext_model = fasttext.load_model("lid.176.ftz")
else:
print("Downloading fastext model from website and saving locally...")
r = requests.get(
"https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
)
with open("lid.176.ftz", "wb") as f:
f.write(r.content)
fasttext_model = fasttext.load_model("lid.176.ftz")
print("Done.\n")
def process_chunk_fasttext(text_chunk, threshold=0.5):
preds = fasttext_model.predict(text_chunk.tolist(), k=1)
preds = [
lng[0][-2:] if score[0] > threshold else ""
for lng, score in zip(preds[0], preds[1])
]
return preds
chunk_fasttext = thread_map(
process_chunk_fasttext,
grouper(dataset["text_language_detect"], batch_size),
max_workers=max_workers,
total=len(dataset) // batch_size,
)
dataset["language"] = np.concatenate(chunk_fasttext)
return dataset
#############################
#### Compute Embeddings ####
#############################
def download_USE(
use_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3",
):
use_model = hub.load(use_url)
tf.saved_model.save(use_model, "use_model_kaggle")
return use_model
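# Note: the loaded model is also exported to the local "use_model_kaggle/" directory
# (listed in .gitignore above).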
@timeit
def compute_embeddings(df, batch_size: int = 100, max_workers: int = 8):
"""
Compute embeddings for distance comparison
Args:
df (pd.DataFrame): dataframe containing the column "text_to_embed" with the text to be embedded
batch_size (int, optional): batch size of text to be retrieved each step by parallelization. Defaults to 100.
max_workers (int, optional): number of workers for parallelization. Defaults to 8.
Returns:
dataset (pd.DataFrame): A dataset with new columns added containing the results of embeddings computation.
"""
assert "text_to_embed" in df.columns, print(
"You need to compute text_to_embed columns"
)
use_model = download_USE()
def process_chunk_use(text_chunk):
return pd.DataFrame(
use_model(text_chunk).numpy(),
index=text_chunk.index,
columns=[f"USE:{i}" for i in range(512)],
)
chunk_use = thread_map(
process_chunk_use,
grouper(df["text_to_embed"], batch_size),
max_workers=max_workers,
total=len(df) // batch_size,
)
dataset = pd.concat([pd.concat(chunk_use, axis=0)], axis=1)
dataset.index = df.index
return dataset
@timeit
def create_index_cosine(df_embeddings: pd.DataFrame):
""" "
Create index with faiss for faster cosine distance computation
Args:
df_embeddings (pd.DataFrame): dataframe containing the embeddings
Returns:
index: A faiss index which can be used to compute cosine distances more efficiently
"""
embeddings = df_embeddings.to_numpy()
ids = list(df_embeddings.index)
# cosine similarity index...
vector_dimension = embeddings.shape[1]
index_flat = faiss.IndexFlat(vector_dimension, faiss.METRIC_INNER_PRODUCT)
# ...encapsulated in another index in order to have posts ids
index = faiss.IndexIDMap(index_flat)
# for cosine similarity, need of normalisation
try:
faiss.normalize_L2(embeddings)
except:
embeddings = embeddings.copy(order="C")
faiss.normalize_L2(embeddings)
print("C contiguous problem solved")
# add embeddings & ids
index.add_with_ids(embeddings, ids)
return index
@timeit
def find_matches(
df_embeddings_search: pd.DataFrame,
index,
threshold: float = 0.7,
batch_size: int = 100,
verbose=True,
):
"""
Compute pairwise cosine similarity between a subset of docs and all docs in the index
Args :
df_embeddings_search (pd.DataFrame): dataframe containing embeddings we want to find similarity with in the faiss index
index: faiss index
threshold (float, optional): threshold for similarity. Defaults to 0.7.
batch_size (int, optional): number of vector per batch. Defaults to 100.
Returns :
matches (pd.DataFrame): A dataframe of pairs of duplicated texts with cosine score associated.
"""
list_indices = []
for i_batch in trange(
0, len(df_embeddings_search), batch_size, disable=not verbose
):
limits, distances, indices = index.range_search(
df_embeddings_search.iloc[i_batch : i_batch + batch_size].to_numpy(),
thresh=threshold,
)
for lim in range(len(limits) - 1):
source = df_embeddings_search.index[i_batch + lim]
for target, score in zip(
indices[limits[lim] : limits[lim + 1]],
distances[limits[lim] : limits[lim + 1]],
):
if str(target) != str(source): # doesn't match with its own embedding
list_indices.append([str(source), str(target), score])
# create matches dataframe
matches = pd.DataFrame(list_indices, columns=["source", "target", "score"])
# drop duplicates because we have A-B and B-A
matches["duplicates"] = matches.apply(
lambda row: str(min(row["source"], row["target"]))
+ "-"
+ str(max(row["source"], row["target"])),
axis=1,
)
matches = matches.drop_duplicates("duplicates")
return matches
def similarity_levenshtein(pair):
s1, s2 = pair
assert (
min(len(s1), len(s2)) > 0
), "one text_grapheme is None and levenshtein can't be retrieved"
return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))
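# Example: similarity_levenshtein(("kitten", "sitting")) == 1 - 3/7 β‰ˆ 0.571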
@timeit
def compute_duplicate_types(
matches: pd.DataFrame,
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
):
"""
Distinguish 3 different duplicate types: translation, rewording & copypasta
Args :
matches (pd.DataFrame): dataframe of pairs of texts containing text_grapheme_source and text_grapheme_target columns for detecting copypasta and language_source and language_target for detecting translation
threshold_grapheme (float, optional): threshold to distinguish copypasta from rewording using levenshtein. Defaults to 0.693.
threshold_language (float, optional): threshold to detect translation. Defaults to 0.715.
threshold_semantic (float, optional): threshold to detect rewording. Defaults to 0.85.
Returns :
matches_strict (pd.DataFrame): dataframe containing 'copypasta', 'translation' and 'rewording' pairs of texts with score (cosine similarity from embeddings) and score_lev (score calculated from levenshtein) associated.
"""
assert ("text_grapheme_source" in matches.columns) & (
"text_grapheme_target" in matches.columns
), print(
"You need text_grapheme_source and text_grapheme_target columns in dataframe for Levenstein"
)
assert ("language_source" in matches.columns) & (
"language_target" in matches.columns
), print(
"You need language_source and language_target columns in dataframe for Levenstein"
)
matches["dup_type"] = "rewording"
matches.loc[
matches["language_source"] != matches["language_target"], "dup_type"
] = "translation"
matches.loc[matches.dup_type == "rewording", "score_lev"] = matches.loc[
matches.dup_type == "rewording"
].apply(
lambda x: similarity_levenshtein(
(x["text_grapheme_source"], x["text_grapheme_target"])
),
axis=1,
)
matches.loc[matches.score_lev > threshold_grapheme, "dup_type"] = "copy-pasta"
matches_strict = matches[
((matches.score > threshold_semantic) & (matches.dup_type == "rewording"))
| ((matches.score > threshold_language) & (matches.dup_type == "translation"))
| (matches.dup_type == "copy-pasta")
]
return matches_strict
def create_dataset_clusters(dataset: pd.DataFrame, edgelist: pd.DataFrame):
"""Give a cluster of duplicated content to all documents.
None if no duplicated content was found for a document
Args:
dataset (pd.DataFrame): dataframe containing each document and same index used to create embeddings and faiss index.
edgelist (pd.DataFrame): dataframe corresponding to pairs of texts and score associated
Return:
df_cluster (pd.DataFrame): dataframe with one row corresponding to one text and its cluster of duplicated content associated if it exists.
"""
df_cluster = dataset.copy()
consolidated_edgelist = edgelist.groupby(["source", "target"], as_index=False)[
"score"
].max()
clusters = list(
nx.connected_components(nx.from_pandas_edgelist(consolidated_edgelist))
)
clusters.sort(key=len, reverse=True)
for cluster_i, posts_indices in enumerate(clusters):
df_cluster.loc[list(posts_indices), "cluster"] = cluster_i
return df_cluster
def semantic_faiss(
df: pd.DataFrame,
min_size_txt: int = 30,
df_embeddings_use: pd.DataFrame = None,
embeddings_to_save: str = None,
threshold_grapheme: float = 0.693,
threshold_language: float = 0.715,
threshold_semantic: float = 0.85,
remove_matches_same_user: str = None,
):
"""Apply end to end 3 delta methodology with faiss
Args:
df (pd.DataFrame): dataframe containing the following columns:
- original: the original text
- language (optional): language of each text. If not given, language detection is computed in order to detect translation
min_size_txt (int): minimal text size required to apply the 3 delta method; documents with shorter texts are removed.
df_embeddings_use (pd.DataFrame): dataframe of previously computed embeddings, so that embeddings are not recomputed every time.
embeddings_to_save (str): base name of the pickle file used to save the embeddings, if the user wants to persist them (saved as '{embeddings_to_save}.pkl').
threshold_grapheme (float): threshold to detect copypasta with levenshtein on matches found with faiss. Defaults to 0.693.
threshold_language (float): threshold to find matches between 2 documents for translation. Defaults to 0.715.
threshold_semantic (float): threshold to find matches between 2 documents for rewording. Defaults to 0.85.
remove_matches_same_user (str, optional): name of a column identifying the author of each document; when given, matches between two documents with the same value in this column are discarded. Defaults to None.
Return:
matches (pd.DataFrame): dataframe containing pairs of text detected as duplicate contents from 3 delta
df_cluster (pd.DataFrame): initial dataframe 'df' with its cluster of duplicated content associated if it exists.
"""
df = prepare_dataset(df, min_size_txt=min_size_txt)
if "language" not in df.columns:
print("language detection")
df = compute_language(df)
if df_embeddings_use is None:
df_embeddings_use = compute_embeddings(df)
if embeddings_to_save is not None:
df_embeddings_use.to_pickle(f"{embeddings_to_save}.pkl")
index_faiss = create_index_cosine(df_embeddings_use)
threshold_faiss = min(threshold_language, threshold_semantic)
res = find_matches(df_embeddings_use, index_faiss, threshold=threshold_faiss)
if remove_matches_same_user is not None:
columns_join = [
remove_matches_same_user,
"language",
"text_to_embed",
"text_grapheme",
]
else:
columns_join = ["language", "text_to_embed", "text_grapheme"]
matches = res.merge(
df[columns_join].add_suffix("_source"),
left_on="source",
right_index=True,
how="left",
).merge(
df[columns_join].add_suffix("_target"),
left_on="target",
right_index=True,
how="left",
)
matches = compute_duplicate_types(
matches,
threshold_grapheme=threshold_grapheme,
threshold_language=threshold_language,
threshold_semantic=threshold_semantic,
)
if remove_matches_same_user is not None:
matches = matches[
matches[remove_matches_same_user + "_source"]
!= matches[remove_matches_same_user + "_target"]
]
df_clusters = create_dataset_clusters(df, matches)
return matches, df_clusters

111250
notebooks/example_synthetic_dataset.ipynb Normal file

File diff not shown because it is too large.

25
pyproject.toml Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,25 @@
[tool.poetry]
name = "d3lta"
version = "1.0.0"
description = "d3lta package"
readme = "README.md"
authors = ["Viginum"]
[tool.poetry.dependencies]
python = "^3.10"
demoji = "1.1.0"
faiss-cpu = "1.9.0.post1"
fasttext = "0.9.3"
gensim = "4.3.3"
networkx = "2.8.8"
pandas = "2.2.3"
polyleven = "0.8"
scipy = "1.12.0"
tensorflow = "2.18.0"
tensorflow-hub = "0.16.1"
tensorflow-text = "2.18.1"
tqdm = "4.67.1"
[build-system]
requires = ["setuptools","poetry-core"]
build-backend = "poetry.core.masonry.api"

4
setup.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,4 @@
#!/usr/bin/env python
from setuptools import setup
setup()

Binary data
static/graph.gif Normal file

Binary file not shown. Size: 10 MiB

82
tests/faissd3lta_test.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,82 @@
import re
import pandas as pd
import pytest
from d3lta.faissd3lta import (
compute_embeddings,
compute_language,
create_index_cosine,
semantic_faiss,
)
@pytest.fixture
def examples_dataset():
"""Returns an empty test"""
return [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
def test_compute_language(examples_dataset):
df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df_language = compute_language(df_language)
assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]
def test_embedding_similarity(examples_dataset):
df_test = pd.DataFrame(
examples_dataset,
columns=["text_to_embed"],
index=range(len(examples_dataset)),
) # index for checking that it has good ids
df_emb = compute_embeddings(df_test)
index_t = create_index_cosine(df_emb)
test_dataset = pd.DataFrame([{"text_to_embed": "I gatti sono grigi"}])
df_emb_test = compute_embeddings(test_dataset)
limits, distances, indices = index_t.range_search(
x=df_emb_test.to_numpy().reshape(1, -1), thresh=0.7
)
assert (
df_test.loc[indices]["text_to_embed"]
.str.contains("chat|cat", flags=re.IGNORECASE, na=False)
.all()
)
def test_semantic_faiss(examples_dataset):
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df = compute_language(df)
df_emb = compute_embeddings(
df.assign(text_to_embed=lambda x: x["text_language_detect"])
)
df.index = df.index.astype(str)
matches, df_clusters = semantic_faiss(
df=df.rename(columns={"text_language_detect": "original"}),
min_size_txt=1,
df_embeddings_use=df_emb,
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
)
assert (
df_clusters.query("cluster == 0")["original"]
.str.contains("cat|chat", flags=re.IGNORECASE)
.all()
)
assert (
matches.query(
'text_to_embed_source == "Les chats sont gris" and text_to_embed_target == "Cats are grey"'
)["dup_type"]
== "translation"
).all()
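# To run this test module locally (assuming pytest and the package are installed):
#   pytest tests/faissd3lta_test.py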