This commit is contained in:
Someone 2024-12-19 10:58:10 +01:00
Commit 858072c471
10 changed files: 112413 additions and 0 deletions

230
.gitignore vendored Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,230 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,git,linux
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,git,linux
### Git ###
# Created by git for backups. To disable backups in Git:
# $ git config --global mergetool.keepBackup false
*.orig
# Created by git when using merge tools for conflicts
*.BACKUP.*
*.BASE.*
*.LOCAL.*
*.REMOTE.*
*_BACKUP_*.txt
*_BASE_*.txt
*_LOCAL_*.txt
*_REMOTE_*.txt
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,git,linux
*.pkl
*.ftz
use_model_kaggle/
include/

21
LICENSE.txt Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 VIGINUM
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

226
README.md Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,226 @@
<h2 align="center"> <a href="https://arxiv.org/abs/2312.17338">D3lta</a></h2>
<h5 align="center">
If you like our project, please give us a star ⭐ on GitHub for the latest updates.
</h5>
<div align=center>
[![arXiv](https://img.shields.io/badge/Arxiv-2312.17338-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2312.17338)
This repository is the official implementation of D3lta, a library for detecting duplicate verbatim content within a vast amount of documents.
It distinguishes 3 types of duplicated content: copypasta (almost exact duplicates), rewording and translation. You can run it on CPU.
</div>
---
<img style="display: block; margin: auto;" src="./static/graph.gif"/>
## πŸ’» Installing
Clone the repository
```bash
git clone https://github.com/VIGINUM-FR/D3lta
```
Navigate to the project
```bash
cd D3lta
```
Install the package
```bash
pip install -e .
```
## πŸš€ Quick start
You can apply the `semantic_faiss` function directly to a DataFrame containing texts.
By default, the embeddings come from the [Universal Sentence Encoder](https://www.kaggle.com/models/google/universal-sentence-encoder/tensorFlow1/lite/2),
but other models can be used to compute the embeddings (see the last example below).
```python
import pandas as pd
from d3lta.faissd3lta import *
examples_dataset = [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df.index = df.index.astype(str)
matches, df_clusters = semantic_faiss(
df=df.rename(columns={"text_language_detect": "original"}),
min_size_txt=10,
embeddings_to_save='myembeddings',
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
)
>>>matches
source target score duplicates language_source text_to_embed_source text_grapheme_source language_target text_to_embed_target text_grapheme_target dup_type score_lev
0 2 3 0.745741 2-3 fr Les chats sont gris leschatssontgris en Cat's are grey, aren't they ? catsaregreyarentthey translation NaN
1 2 4 0.955517 2-4 fr Les chats sont gris leschatssontgris en Cats are grey catsaregrey translation NaN
2 2 5 0.808805 2-5 fr Les chats sont gris leschatssontgris fr Les chats ne sont pas gris leschatsnesontpasgris copy-pasta 0.761905
5 3 5 0.833525 3-5 en Cat's are grey, aren't they ? catsaregreyarentthey fr Les chats ne sont pas gris leschatsnesontpasgris translation NaN
8 4 5 0.767601 4-5 en Cats are grey catsaregrey fr Les chats ne sont pas gris leschatsnesontpasgris translation NaN
>>>df_clusters
original language text_grapheme text_to_embed text_language_detect cluster
0 Je m'apelle Mimie et je fais du stop fr jemapellemimieetjefaisdustop Je m'apelle Mimie et je fais du stop Je m'apelle Mimie et je fais du stop NaN
1 Je m'apelle Giselle et toi ? fr jemapellegiselleettoi Je m'apelle Giselle et toi ? Je m'apelle Giselle et toi ? NaN
2 Les chats sont gris fr leschatssontgris Les chats sont gris Les chats sont gris 0.0
3 Cat's are grey, aren't they ? en catsaregreyarentthey Cat's are grey, aren't they ? Cat's are grey, aren't they ? 0.0
4 Cats are grey en catsaregrey Cats are grey Cats are grey 0.0
5 Les chats ne sont pas gris fr leschatsnesontpasgris Les chats ne sont pas gris Les chats ne sont pas gris 0.0
```
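Since the call above passes `embeddings_to_save='myembeddings'`, the computed embeddings are written to `myembeddings.pkl`. A minimal sketch of a later run that reuses them through `df_embeddings_use` instead of recomputing them (assuming the file from the previous run is in the working directory):
```python
import pandas as pd
from d3lta.faissd3lta import semantic_faiss

# reload the embeddings pickled by the previous run
df_emb = pd.read_pickle("myembeddings.pkl")

matches, df_clusters = semantic_faiss(
    df=df.rename(columns={"text_language_detect": "original"}),  # same `df` as above
    min_size_txt=10,
    df_embeddings_use=df_emb,  # skip the embedding computation
    threshold_grapheme=0.693,
    threshold_language=0.715,
    threshold_semantic=0.85,
)
```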
It is also possible to use [Faiss](https://github.com/facebookresearch/faiss) directly to search for similar embeddings.
```python
import pandas as pd
from d3lta.faissd3lta import *
examples_dataset = [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
df_test = pd.DataFrame(
examples_dataset,
columns=["text_to_embed"],
index=range(len(examples_dataset)),
) # index for checking that it has good ids
df_emb = compute_embeddings(df_test)
index_t = create_index_cosine(df_emb)
test_dataset = pd.DataFrame([{"text_to_embed": "I gatti sono grigi"}])
df_emb_test = compute_embeddings(test_dataset)
limits, distances, indices = index_t.range_search(
x=df_emb_test.to_numpy().reshape(1, -1), thresh=0.7
)
>>>df_test.loc[indices]["text_to_embed"]
2 Les chats sont gris
3 Cat's are grey, aren't they ?
4 Cats are grey
5 Les chats ne sont pas gris
Name: text_to_embed, dtype: object
```
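For larger corpora, the index built above can be persisted with faiss's standard `write_index`/`read_index` helpers, so the corpus does not have to be re-indexed on every run. A minimal sketch (the file name is arbitrary):
```python
import faiss

# save the cosine-similarity index built above
faiss.write_index(index_t, "d3lta_use.index")

# ...and reload it later for new queries
index_reloaded = faiss.read_index("d3lta_use.index")
limits, distances, indices = index_reloaded.range_search(
    x=df_emb_test.to_numpy().reshape(1, -1), thresh=0.7
)
```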
It is also possible to use your own embeddings (other than the Universal Sentence Encoder). For example:
```python
import pandas as pd
from sentence_transformers import SentenceTransformer
from d3lta.faissd3lta import *
examples_dataset = [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df.index = df.index.astype(str)
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
new_emb = model.encode(df['text_language_detect'].values.tolist())
df_emb = pd.DataFrame(new_emb, index=df.index)
matches, df_clusters = semantic_faiss(
df=df.rename(columns={"text_language_detect": "original"}),
min_size_txt=10,
df_embeddings_use=df_emb,
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
)
matches
```
## πŸ“š Synthetic dataset
The dataset is available in the release `1.0.0`. It contains the following files:
### `synthetic_dataset_documents.csv`:
This file contains all seeds (real and original texts) and their generated variations (copy-pasta, rewording or translations).
The corpus contains 2,985 documents; the variations were generated with a large language model.
Column details:
- doc_id (int): unique number associated with each text. Seed indices are multiples of 10 and are followed by their 9 transformations.
- original (str): real or transformed text
- text_type (str): dataset where the seed was extracted (`books`, `news`, `tweets`)
- language (str): language of the text
- prompt (str): prompt given to ChatGPT for "copypasta" and "rewording"
- seed (bool): True if the text is one of the 300 initial texts from which the variations were generated
The 300 initial texts (seeds) were taken from three Kaggle datasets:
- https://www.kaggle.com/competitions/nlp-getting-started/data
- https://www.kaggle.com/datasets/abireltaief/books-reviews
- https://www.kaggle.com/datasets/rmisra/news-category-dataset
(For more info, please refer to the [paper](https://arxiv.org/abs/2312.17338))
### `synthetic_dataset_pairs_unbalanced.csv`:
This file contains the 1,497,547 annotated text pairs of the synthetic dataset: 4,500 translation pairs, 4,030 copypasta pairs, 4,017 rewording pairs and 1,485,000 pairs of non-duplicated content labelled "nomatch".
Column details:
- source_target (str): unique id for the pair.
- source (int): index of the first text of the pair in the synthetic_dataset_documents.csv
- target (int): index of the second text of the pair in the synthetic_dataset_documents.csv
- original_source (str): text of the source index
- original_target (str): text of the target index
- language_source (str): language of original_source
- language_target (str): language of original_target
- true_label (str): transformation relation that links the two texts of the pair, i.e. the source and target texts are {true_label} of each other. The true_label can be "copypasta", "rewording" or "translation". A short loading sketch is shown below.
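A minimal sketch of loading both files with pandas, assuming the two CSVs from release `1.0.0` have been downloaded into the working directory (file and column names as described above):
```python
import pandas as pd

documents = pd.read_csv("synthetic_dataset_documents.csv")
pairs = pd.read_csv("synthetic_dataset_pairs_unbalanced.csv")

print(len(documents))                      # expected: 2985 documents
print(pairs["true_label"].value_counts())  # distribution of pair labels
```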
## Notebooks
In the [`notebooks`](./notebooks/) folder, you can find:
- [`example_synthetic_dataset.ipynb`](./notebooks/example_synthetic_dataset.ipynb): an example of applying the D3lta methodology to the synthetic dataset, with a comparison to the true labels.
## Citation
If you find our paper and code useful in your research, please consider giving a star 🌟 and a citation πŸ“:
```BibTeX
@misc{richard2023unmasking,
title={Unmasking information manipulation: A quantitative approach to detecting Copy-pasta, Rewording, and Translation on Social Media},
author={Manon Richard and Lisa Giordani and Cristian Brokate and Jean LiΓ©nard},
year={2023},
eprint={2312.17338},
archivePrefix={arXiv},
primaryClass={cs.SI},
url={https://arxiv.org/abs/2312.17338},
}
```

0
d3lta/__init__.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

575
d3lta/faissd3lta.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,575 @@
from functools import wraps
import os
import re
import time
from typing import Union
import demoji
import faiss
import fasttext
from gensim.utils import deaccent
import networkx as nx
import numpy as np
import pandas as pd
from polyleven import levenshtein
import requests
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from tqdm.contrib.concurrent import thread_map
from tqdm.auto import trange
def timeit(func):
@wraps(func)
def timeit_wrapper(*args, **kwargs):
start_time = time.time()
print(f">>> Start {func.__name__}")
result = func(*args, **kwargs)
end_time = time.time()
total_time = end_time - start_time
if total_time < 60:
print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
else:
print(f"<<< End {func.__name__}, Took:{np.round((total_time)/60, 1)} min")
return result
return timeit_wrapper
def grouper(iterable, n):
"""A (lazy) iterator that chunks `iterable` into lists of `n`"""
for i in range(0, len(iterable), n):
yield iterable[i : i + n]
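# Example: list(grouper([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]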
###############################
#### Preprocessing Dataset ####
###############################
def preprocess_text(
s,
lower=True,
remove_accents=True,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=False,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=False,
):
"""
clean a list-like of strings, performing all the following treatments by default
Args:
s (list-like of strings): input list-like of strings
lower (bool, optional): lowercase the text. Defaults to True.
remove_accents (bool, optional): deaccent the text. Defaults to True.
remove_urls (bool, optional): remove urls from the text. Defaults to True.
remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis from the text. Defaults to True.
remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False.
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
remove_punctuation (bool, optional): remove punctuation from the text; be careful, this also removes the # of hashtags. Defaults to False.
"""
if s is None:
s = ""
assert isinstance(s, (str, list, pd.Series, set, frozenset))
if isinstance(s, str):
encapsulated = True
s = [s]
else:
encapsulated = False
if lower:
s = [msg.lower() for msg in s]
if remove_accents:
s = [deaccent(msg) for msg in s]
if remove_urls:
match_url_regexp = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
s = [re.sub(match_url_regexp, "", msg, flags=re.MULTILINE).strip() for msg in s]
if remove_mentions:
match_mentions_regexp = r"(@[a-zA-Z0-9_]+)"
s = [
re.sub(match_mentions_regexp, "", msg, flags=re.MULTILINE).strip()
for msg in s
]
if remove_twitter_cropend:
match_croppedmsg_regexp = r"([^\s]+…)$"
s = [
re.sub(match_croppedmsg_regexp, "", msg, flags=re.MULTILINE).strip()
for msg in s
]
if remove_emojis:
s = [demoji.replace(msg, "").strip() for msg in s]
if remove_hashtags_frontend:
if (not remove_urls) or (not remove_mentions):
print(
"Not all leading and ending hashtags might be removed because there might be mentions or urls"
)
match_hashtags_begin = r"(#\S+ ?)+"
match_hashtags_end = r"(\S+# ?)+"
match_hashtags_frontend = f"^({match_hashtags_begin})|^({match_hashtags_end})|({match_hashtags_begin})$|({match_hashtags_end})$"
s = [re.sub(match_hashtags_frontend, "", msg).strip() for msg in s]
if replace_newline_characters:
match_escapes_regexp = r"(\n|\r)+"
s = [
re.sub(
r"\s+", " ", re.sub(match_escapes_regexp, ". ", msg, flags=re.MULTILINE)
).strip()
for msg in s
]
if remove_punctuation:
match_punctuations = r"[^\w\s]"
s = [
re.sub(r"\s+", " ", re.sub(match_punctuations, " ", msg)).strip()
for msg in s
]
if encapsulated:
return s[0].strip()
else:
return s
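# Example with the default options (lowercasing; urls, mentions and emojis removed), expected output:
# preprocess_text("Bonjour @user https://t.co/xyz 😊") -> "bonjour"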
@timeit
def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int = 30):
"""
Create new columns of preprocessed texts from original text for distance comparison with 3 delta method
Args:
dataset (Union[pd.Series, pd.DataFrame]): dataframe or series containing a column "original" with the text. Optional: a column "language" can be given, otherwise language detection is implemented.
min_size_txt (Optional[int], optional): minimum length of text_grapheme below which a document is considered too short for duplicate detection and is removed. If set to None, no text is removed. Defaults to 30.
Returns:
dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for 3 delta method.
"""
assert isinstance(
dataset, (pd.Series, pd.DataFrame)
), "dataset must be a pd.Series or a pd.DataFrame"
assert dataset.index.nunique() == len(
dataset
), "dataset must be indexed with unique indices"
assert all(
[isinstance(i, str) for i in dataset.index]
), "dataset indices must be `str`"
if isinstance(dataset, pd.DataFrame):
assert (
"original" in dataset.columns
), "when dataset is a pd.DataFrame, it must have a column named 'original'"
if isinstance(dataset, pd.Series):
dataset = dataset.to_frame("original")
# text_grapheme is used for grapheme distance (Levenshtein)
# this is the cleanest version with no spaces
if "text_grapheme" not in dataset.columns:
dataset["text_grapheme"] = [
t.replace(" ", "")
for t in preprocess_text(
dataset["original"],
lower=True,
remove_accents=True,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=True,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=True,
)
]
# text_to_embed is used for semantic distance and embedded with USE
# links are removed
if "text_to_embed" not in dataset.columns:
dataset["text_to_embed"] = preprocess_text(
dataset["original"],
lower=False,
remove_accents=False,
remove_urls=True,
remove_mentions=True,
remove_emojis=False,
remove_hashtags_frontend=False,
remove_twitter_cropend=False,
replace_newline_characters=False,
remove_punctuation=False,
)
# text_language_detect is used for fasttext
# accents and emojis are kept as they provide interesting cues to language
if ("language" not in dataset.columns) or (
"text_language_detect" not in dataset.columns
):
dataset["text_language_detect"] = preprocess_text(
dataset["original"],
lower=False,
remove_accents=False,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=True,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=False,
)
print("Done.")
print("")
if min_size_txt is not None:
print(
f'Removing {(dataset["text_grapheme"].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences...'
)
dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
print("Done.")
return dataset
@timeit
def compute_language(
dataset: pd.DataFrame,
fasttext_model=None,
batch_size: int = 100,
max_workers: int = 8,
):
"""
Compute language detection in order to detect translation
Args :
dataset (pd.DataFrame): dataframe containing the column "text_language_detect" with the text to be analyzed
fasttext_model (Optional[any], optional): optional, if another model than fasttext is to be used, otherwise, fasttext is uploaded. Defaults to None.
batch_size (int, optional): batch size of text to be retrieved each step by parallelization. Defaults to 100.
max_workers (int, optional): number of workers for parallelization. Defaults to 8.
Returns:
dataset (pd.DataFrame): The same input dataset with column 'language' added containing the results of language detection.
"""
assert (
"text_language_detect" in dataset.columns
), "you need to have a column text_language_detect to detect language"
if fasttext_model is None:
if os.path.exists("lid.176.ftz"):
print("Loading fastext model from local file...")
fasttext_model = fasttext.load_model("lid.176.ftz")
else:
print("Downloading fastext model from website and saving locally...")
r = requests.get(
"https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
)
with open("lid.176.ftz", "wb") as f:
f.write(r.content)
fasttext_model = fasttext.load_model("lid.176.ftz")
print("Done.\n")
def process_chunk_fasttext(text_chunk, threshold=0.5):
preds = fasttext_model.predict(text_chunk.tolist(), k=1)
preds = [
lng[0][-2:] if score[0] > threshold else ""
for lng, score in zip(preds[0], preds[1])
]
return preds
chunk_fasttext = thread_map(
process_chunk_fasttext,
grouper(dataset["text_language_detect"], batch_size),
max_workers=max_workers,
total=len(dataset) // batch_size,
)
dataset["language"] = np.concatenate(chunk_fasttext)
return dataset
#############################
#### Compute Embeddings ####
#############################
def download_USE(
use_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3",
):
use_model = hub.load(use_url)
tf.saved_model.save(use_model, "use_model_kaggle")
return use_model
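# Note: the loaded model is also exported to the local "use_model_kaggle/" directory
# (listed in .gitignore above).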
@timeit
def compute_embeddings(df, batch_size: int = 100, max_workers: int = 8):
"""
Compute embeddings for distance comparison
Args:
df (pd.DataFrame): dataframe containing the column "text_to_embed" with the text to be embedded
batch_size (int, optional): batch size of text to be retrieved each step by parallelization. Defaults to 100.
max_workers (int, optional): number of workers for parallelization. Defaults to 8.
Returns:
dataset (pd.DataFrame): A dataset with new columns added containing the results of embeddings computation.
"""
assert "text_to_embed" in df.columns, print(
"You need to compute text_to_embed columns"
)
use_model = download_USE()
def process_chunk_use(text_chunk):
return pd.DataFrame(
use_model(text_chunk).numpy(),
index=text_chunk.index,
columns=[f"USE:{i}" for i in range(512)],
)
chunk_use = thread_map(
process_chunk_use,
grouper(df["text_to_embed"], batch_size),
max_workers=max_workers,
total=len(df) // batch_size,
)
dataset = pd.concat([pd.concat(chunk_use, axis=0)], axis=1)
dataset.index = df.index
return dataset
@timeit
def create_index_cosine(df_embeddings: pd.DataFrame):
""" "
Create index with faiss for faster cosine distance computation
Args:
df_embeddings (pd.DataFrame): dataframe containing the embeddings
Returns:
index: A faiss index which can be used to compute cosine distances more efficiently
"""
embeddings = df_embeddings.to_numpy()
ids = list(df_embeddings.index)
# cosine similarity index...
vector_dimension = embeddings.shape[1]
index_flat = faiss.IndexFlat(vector_dimension, faiss.METRIC_INNER_PRODUCT)
# ...encapsulated in another index in order to have posts ids
index = faiss.IndexIDMap(index_flat)
# for cosine similarity, need of normalisation
try:
faiss.normalize_L2(embeddings)
except:
embeddings = embeddings.copy(order="C")
faiss.normalize_L2(embeddings)
print("C contiguous problem solved")
# add embeddings & ids
index.add_with_ids(embeddings, ids)
return index
@timeit
def find_matches(
df_embeddings_search: pd.DataFrame,
index,
threshold: float = 0.7,
batch_size: int = 100,
verbose=True,
):
"""
Compute pairwise cosine similarity between a subset of docs and all docs in the index
Args :
df_embeddings_search (pd.DataFrame): dataframe containing embeddings we want to find similarity with in the faiss index
index: faiss index
threshold (float, optional): threshold for similarity. Defaults to 0.7.
batch_size (int, optional): number of vector per batch. Defaults to 100.
Returns :
matches (pd.DataFrame): A dataframe of pairs of duplicated texts with cosine score associated.
"""
list_indices = []
for i_batch in trange(
0, len(df_embeddings_search), batch_size, disable=not verbose
):
limits, distances, indices = index.range_search(
df_embeddings_search.iloc[i_batch : i_batch + batch_size].to_numpy(),
thresh=threshold,
)
for lim in range(len(limits) - 1):
source = df_embeddings_search.index[i_batch + lim]
for target, score in zip(
indices[limits[lim] : limits[lim + 1]],
distances[limits[lim] : limits[lim + 1]],
):
if str(target) != str(source): # doesn't match with its own embedding
list_indices.append([str(source), str(target), score])
# create matches dataframe
matches = pd.DataFrame(list_indices, columns=["source", "target", "score"])
# drop duplicates because we have A-B and B-A
matches["duplicates"] = matches.apply(
lambda row: str(min(row["source"], row["target"]))
+ "-"
+ str(max(row["source"], row["target"])),
axis=1,
)
matches = matches.drop_duplicates("duplicates")
return matches
def similarity_levenshtein(pair):
s1, s2 = pair
assert (
min(len(s1), len(s2)) > 0
), "one text_grapheme is None and levenshtein can't be retrieved"
return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))
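# Example: similarity_levenshtein(("kitten", "sitting")) == 1 - 3/7 β‰ˆ 0.571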
@timeit
def compute_duplicate_types(
matches: pd.DataFrame,
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
):
"""
Distinguish 3 different duplicate types: translation, rewording & copypasta
Args :
matches (pd.DataFrame): dataframe of pairs of texts containing text_grapheme_source and text_grapheme_target columns for detecting copypasta and language_source and language_target for detecting translation
threshold_grapheme (float, optional): threshold to distinguish copypasta from rewording using levenshtein. Defaults to 0.693.
threshold_language (float, optional): threshold to detect translation. Defaults to 0.715.
threshold_semantic (float, optional): threshold to detect rewording. Defaults to 0.85.
Returns :
matches_strict (pd.DataFrame): dataframe containing 'copypasta', 'translation' and 'rewording' pairs of texts with score (cosine similarity from embeddings) and score_lev (score calculated from levenshtein) associated.
"""
assert ("text_grapheme_source" in matches.columns) & (
"text_grapheme_target" in matches.columns
), print(
"You need text_grapheme_source and text_grapheme_target columns in dataframe for Levenstein"
)
assert ("language_source" in matches.columns) & (
"language_target" in matches.columns
), print(
"You need language_source and language_target columns in dataframe for Levenstein"
)
matches["dup_type"] = "rewording"
matches.loc[
matches["language_source"] != matches["language_target"], "dup_type"
] = "translation"
matches.loc[matches.dup_type == "rewording", "score_lev"] = matches.loc[
matches.dup_type == "rewording"
].apply(
lambda x: similarity_levenshtein(
(x["text_grapheme_source"], x["text_grapheme_target"])
),
axis=1,
)
matches.loc[matches.score_lev > threshold_grapheme, "dup_type"] = "copy-pasta"
matches_strict = matches[
((matches.score > threshold_semantic) & (matches.dup_type == "rewording"))
| ((matches.score > threshold_language) & (matches.dup_type == "translation"))
| (matches.dup_type == "copy-pasta")
]
return matches_strict
def create_dataset_clusters(dataset: pd.DataFrame, edgelist: pd.DataFrame):
"""Give a cluster of duplicated content to all documents.
None if no duplicated content was found for a document
Args:
dataset (pd.DataFrame): dataframe containing each document and same index used to create embeddings and faiss index.
edgelist (pd.DataFrame): dataframe corresponding to pairs of texts and score associated
Return:
df_cluster (pd.DataFrame): dataframe with one row corresponding to one text and its cluster of duplicated content associated if it exists.
"""
df_cluster = dataset.copy()
consolidated_edgelist = edgelist.groupby(["source", "target"], as_index=False)[
"score"
].max()
clusters = list(
nx.connected_components(nx.from_pandas_edgelist(consolidated_edgelist))
)
clusters.sort(key=len, reverse=True)
for cluster_i, posts_indices in enumerate(clusters):
df_cluster.loc[list(posts_indices), "cluster"] = cluster_i
return df_cluster
def semantic_faiss(
df: pd.DataFrame,
min_size_txt: int = 30,
df_embeddings_use: pd.DataFrame = None,
embeddings_to_save: str = None,
threshold_grapheme: float = 0.693,
threshold_language: float = 0.715,
threshold_semantic: float = 0.85,
remove_matches_same_user: str = None,
):
"""Apply end to end 3 delta methodology with faiss
Args:
df (pd.DataFrame): dataframe containing the following columns:
- original: the original text
- language (optional): language of each text. If not given, language detection is computed in order to detect translation
min_size_txt (int): minimal text size required to apply the 3 delta method; documents with shorter texts are removed.
df_embeddings_use (pd.DataFrame): dataframe of previously computed embeddings, so that embeddings are not recomputed every time.
embeddings_to_save (str): base name of the pickle file used to save the embeddings, if the user wants to persist them (saved as '{embeddings_to_save}.pkl').
threshold_grapheme (float): threshold to detect copypasta with levenshtein on matches found with faiss. Defaults to 0.693.
threshold_language (float): threshold to find matches between 2 documents for translation. Defaults to 0.715.
threshold_semantic (float): threshold to find matches between 2 documents for rewording. Defaults to 0.85.
remove_matches_same_user (str, optional): name of a column identifying the author of each document; when given, matches between two documents with the same value in this column are discarded. Defaults to None.
Return:
matches (pd.DataFrame): dataframe containing pairs of text detected as duplicate contents from 3 delta
df_cluster (pd.DataFrame): initial dataframe 'df' with its cluster of duplicated content associated if it exists.
"""
df = prepare_dataset(df, min_size_txt=min_size_txt)
if "language" not in df.columns:
print("language detection")
df = compute_language(df)
if df_embeddings_use is None:
df_embeddings_use = compute_embeddings(df)
if embeddings_to_save is not None:
df_embeddings_use.to_pickle(f"{embeddings_to_save}.pkl")
index_faiss = create_index_cosine(df_embeddings_use)
threshold_faiss = min(threshold_language, threshold_semantic)
res = find_matches(df_embeddings_use, index_faiss, threshold=threshold_faiss)
if remove_matches_same_user is not None:
columns_join = [
remove_matches_same_user,
"language",
"text_to_embed",
"text_grapheme",
]
else:
columns_join = ["language", "text_to_embed", "text_grapheme"]
matches = res.merge(
df[columns_join].add_suffix("_source"),
left_on="source",
right_index=True,
how="left",
).merge(
df[columns_join].add_suffix("_target"),
left_on="target",
right_index=True,
how="left",
)
matches = compute_duplicate_types(
matches,
threshold_grapheme=threshold_grapheme,
threshold_language=threshold_language,
threshold_semantic=threshold_semantic,
)
if remove_matches_same_user is not None:
matches = matches[
matches[remove_matches_same_user + "_source"]
!= matches[remove_matches_same_user + "_target"]
]
df_clusters = create_dataset_clusters(df, matches)
return matches, df_clusters

111250
notebooks/example_synthetic_dataset.ipynb Normal file

File diff not shown because it is too large.

25
pyproject.toml Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,25 @@
[tool.poetry]
name = "d3lta"
version = "1.0.0"
description = "d3lta package"
readme = "README.md"
authors = ["Viginum"]
[tool.poetry.dependencies]
python = "^3.10"
demoji = "1.1.0"
faiss-cpu = "1.9.0.post1"
fasttext = "0.9.3"
gensim = "4.3.3"
networkx = "2.8.8"
pandas = "2.2.3"
polyleven = "0.8"
scipy = "1.12.0"
tensorflow = "2.18.0"
tensorflow-hub = "0.16.1"
tensorflow-text = "2.18.1"
tqdm = "4.67.1"
[build-system]
requires = ["setuptools","poetry-core"]
build-backend = "poetry.core.masonry.api"

4
setup.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,4 @@
#!/usr/bin/env python
from setuptools import setup
setup()

Binary data
static/graph.gif Normal file

Binary file not shown. Size: 10 MiB

82
tests/faissd3lta_test.py Normal file
ΠŸΡ€ΠΎΡΠΌΠΎΡ‚Ρ€Π΅Ρ‚ΡŒ Ρ„Π°ΠΉΠ»

@@ -0,0 +1,82 @@
import re
import pandas as pd
import pytest
from d3lta.faissd3lta import (
compute_embeddings,
compute_language,
create_index_cosine,
semantic_faiss,
)
@pytest.fixture
def examples_dataset():
"""Returns an empty test"""
return [
"Je m'apelle Mimie et je fais du stop",
"Je m'apelle Giselle et toi ?",
"Les chats sont gris",
"Cat's are grey, aren't they ?",
"Cats are grey",
"Les chats ne sont pas gris",
]
def test_compute_language(examples_dataset):
df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df_language = compute_language(df_language)
assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]
def test_embedding_similarity(examples_dataset):
df_test = pd.DataFrame(
examples_dataset,
columns=["text_to_embed"],
index=range(len(examples_dataset)),
) # index for checking that it has good ids
df_emb = compute_embeddings(df_test)
index_t = create_index_cosine(df_emb)
test_dataset = pd.DataFrame([{"text_to_embed": "I gatti sono grigi"}])
df_emb_test = compute_embeddings(test_dataset)
limits, distances, indices = index_t.range_search(
x=df_emb_test.to_numpy().reshape(1, -1), thresh=0.7
)
assert (
df_test.loc[indices]["text_to_embed"]
.str.contains("chat|cat", flags=re.IGNORECASE, na=False)
.all()
)
def test_semantic_faiss(examples_dataset):
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df = compute_language(df)
df_emb = compute_embeddings(
df.assign(text_to_embed=lambda x: x["text_language_detect"])
)
df.index = df.index.astype(str)
matches, df_clusters = semantic_faiss(
df=df.rename(columns={"text_language_detect": "original"}),
min_size_txt=1,
df_embeddings_use=df_emb,
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
)
assert (
df_clusters.query("cluster == 0")["original"]
.str.contains("cat|chat", flags=re.IGNORECASE)
.all()
)
assert (
matches.query(
'text_to_embed_source == "Les chats sont gris" and text_to_embed_target == "Cats are grey"'
)["dup_type"]
== "translation"
).all()
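# To run this test module locally (assuming pytest and the package are installed):
#   pytest tests/faissd3lta_test.py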