import os
import re
import time
from functools import wraps
from typing import Union

import faiss
import fasttext
import networkx as nx
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
import tensorflow_hub as hub

# importing `tensorflow_text` ensures that some ops required by the USE model are available at runtime
import tensorflow_text  # noqa: F401 # pylint: disable=unused-import
from gensim.utils import deaccent
from polyleven import levenshtein
from tqdm.auto import trange
from tqdm.contrib.concurrent import thread_map

from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover


def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.time()
        print(f">>> Start {func.__name__}")
        result = func(*args, **kwargs)
        end_time = time.time()
        total_time = end_time - start_time
        if total_time < 60:
            print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
        else:
            print(f"<<< End {func.__name__}, Took: {np.round(total_time / 60, 1)} min")
        return result

    return timeit_wrapper


def grouper(iterable, n):
    """A (lazy) iterator that chunks `iterable` into lists of `n`"""
    for i in range(0, len(iterable), n):
        yield iterable[i : i + n]


###############################
#### Preprocessing Dataset ####
###############################


def preprocess_text(
    s: str | list[str] | set[str] | frozenset[str] | pd.Series,
    lower: bool = True,
    remove_accents: bool = True,
    remove_urls: bool = True,
    remove_mentions: bool = True,
    remove_emojis: bool = True,
    remove_hashtags_frontend: bool = False,
    remove_twitter_cropend: bool = False,
    replace_newline_characters: bool = True,
    remove_punctuation: bool = False,
    emojis_remover: EmojisRemover | None = None,
):
    """
    Clean a list-like of strings, performing all of the following treatments by default.

    Args:
        s (list-like of strings): input list-like of strings
        lower (bool, optional): lowercase the text. Defaults to True.
        remove_accents (bool, optional): deaccent the text. Defaults to True.
        remove_urls (bool, optional): remove urls from the text. Defaults to True.
        remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
        remove_emojis (bool, optional): remove emojis and other pictograms from the text. Defaults to True.
        remove_hashtags_frontend (bool, optional): remove leading and trailing hashtags from the text. Defaults to False.
        remove_twitter_cropend (bool, optional): remove the "…" character that Twitter appends to messages that are too long. Defaults to False.
        replace_newline_characters (bool, optional): replace two commonly found escape characters, \r and \n, with '. '. Defaults to True.
        remove_punctuation (bool, optional): remove punctuation from the text; be careful, this also removes the # of hashtags. Defaults to False.
        emojis_remover (EmojisRemover, optional):
            if provided, overrides the default engine used for emojis matching and removal.
            Has no effect if `remove_emojis` is set to False.
    """
    if s is None:
        s = ""

    if emojis_remover is None:
        emojis_remover = ExplicitUnicodeBlocksEmojisRemover()

    assert isinstance(s, (str, list, pd.Series, set, frozenset))

    if isinstance(s, str):
        encapsulated = True
        s = [s]
    else:
        encapsulated = False
    if lower:
        s = [msg.lower() for msg in s]
    if remove_accents:
        s = [deaccent(msg) for msg in s]
    if remove_urls:
        match_url_regexp = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
        s = [re.sub(match_url_regexp, "", msg, flags=re.MULTILINE).strip() for msg in s]
    if remove_mentions:
        match_mentions_regexp = r"(@[a-zA-Z0-9_]+)"
        s = [
            re.sub(match_mentions_regexp, "", msg, flags=re.MULTILINE).strip()
            for msg in s
        ]
    if remove_twitter_cropend:
        match_croppedmsg_regexp = r"([^\s]+…)$"
        s = [
            re.sub(match_croppedmsg_regexp, "", msg, flags=re.MULTILINE).strip()
            for msg in s
        ]
    if remove_emojis:
        s = [emojis_remover.remove_symbols(msg).strip() for msg in s]

    if remove_hashtags_frontend:
        if (not remove_urls) or (not remove_mentions):
            print(
                "Not all leading and trailing hashtags might be removed because there might be mentions or urls"
            )
        match_hashtags_begin = r"(#\S+ ?)+"
        match_hashtags_end = r"(\S+# ?)+"
        match_hashtags_frontend = f"^({match_hashtags_begin})|^({match_hashtags_end})|({match_hashtags_begin})$|({match_hashtags_end})$"
        s = [re.sub(match_hashtags_frontend, "", msg).strip() for msg in s]
    if replace_newline_characters:
        match_escapes_regexp = r"(\n|\r)+"
        s = [
            re.sub(
                r"\s+", " ", re.sub(match_escapes_regexp, ". ", msg, flags=re.MULTILINE)
            ).strip()
            for msg in s
        ]
    if remove_punctuation:
        match_punctuations = r"[^\w\s]"
        s = [
            re.sub(r"\s+", " ", re.sub(match_punctuations, " ", msg)).strip()
            for msg in s
        ]
    if encapsulated:
        return s[0].strip()
    else:
        return s
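
# Usage sketch for preprocess_text (illustrative only; the input string is hypothetical
# and the output shown is approximate):
#
#     preprocess_text("Bonjour @alice! Voici un lien https://example.com #test")
#     # -> roughly "bonjour ! voici un lien #test"
#
# With default options the text is lowercased, deaccented, and stripped of urls,
# mentions and emojis; hashtags are kept unless remove_punctuation or
# remove_hashtags_frontend is enabled.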


@timeit
def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int = 30):
    """
    Create new columns of preprocessed texts from the original text, for distance comparison with the 3 delta method.

    Args:
        dataset (Union[pd.Series, pd.DataFrame]): dataframe or series containing a column "original" with the text. Optional: a column "language" can be given, otherwise language detection is performed.
        min_size_txt (Optional[int], optional): minimum length of `text_grapheme` below which a text is not taken into account for duplicate detection because it is too short. If set to None, no text is removed. Defaults to 30.
    Returns:
        dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for the 3 delta method.
    """
    assert isinstance(dataset, (pd.Series, pd.DataFrame)), (
        "dataset must be a pd.Series or a pd.DataFrame"
    )

    assert dataset.index.nunique() == len(dataset), (
        "dataset must be indexed with unique indices"
    )

    assert all([isinstance(i, str) for i in dataset.index]), (
        "dataset indices must be `str`"
    )

    if isinstance(dataset, pd.DataFrame):
        assert "original" in dataset.columns, (
            "when dataset is a pd.DataFrame, it must have a column named 'original'"
        )

    if isinstance(dataset, pd.Series):
        dataset = dataset.to_frame("original")

    # text_grapheme is used for grapheme distance (Levenshtein)
    # this is the cleanest version, with no spaces
    if "text_grapheme" not in dataset.columns:
        dataset["text_grapheme"] = [
            t.replace(" ", "")
            for t in preprocess_text(
                dataset["original"],
                lower=True,
                remove_accents=True,
                remove_urls=True,
                remove_mentions=True,
                remove_emojis=True,
                remove_hashtags_frontend=True,
                remove_twitter_cropend=False,
                replace_newline_characters=True,
                remove_punctuation=True,
            )
        ]

    # text_to_embed is used for semantic distance and embedded with USE
    # links are removed
    if "text_to_embed" not in dataset.columns:
        dataset["text_to_embed"] = preprocess_text(
            dataset["original"],
            lower=False,
            remove_accents=False,
            remove_urls=True,
            remove_mentions=True,
            remove_emojis=False,
            remove_hashtags_frontend=False,
            remove_twitter_cropend=False,
            replace_newline_characters=False,
            remove_punctuation=False,
        )
    # text_language_detect is used for fasttext
    # accents are kept as they provide interesting cues to language (emojis are removed)
    if ("language" not in dataset.columns) or (
        "text_language_detect" not in dataset.columns
    ):
        dataset["text_language_detect"] = preprocess_text(
            dataset["original"],
            lower=False,
            remove_accents=False,
            remove_urls=True,
            remove_mentions=True,
            remove_emojis=True,
            remove_hashtags_frontend=True,
            remove_twitter_cropend=False,
            replace_newline_characters=True,
            remove_punctuation=False,
        )
    print("Done.")
    print("")

    if min_size_txt is not None:
        print(
            f"Removing {(dataset['text_grapheme'].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences..."
        )
        dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
        print("Done.")

    return dataset
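
# Usage sketch for prepare_dataset (hypothetical data; indices must be unique strings):
#
#     df = pd.DataFrame(
#         {"original": ["A first message that is long enough to be kept by the filter",
#                       "Another message, also long enough to be kept by the filter"]},
#         index=["doc_0", "doc_1"],
#     )
#     df = prepare_dataset(df, min_size_txt=30)
#     # df now has text_grapheme, text_to_embed and text_language_detect columns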


@timeit
def compute_language(
    dataset: pd.DataFrame,
    fasttext_model=None,
    batch_size: int = 100,
    max_workers: int = 8,
):
    """
    Compute language detection, in order to detect translations.

    Args:
        dataset (pd.DataFrame): dataframe containing the column "text_language_detect" with the text to be analyzed
        fasttext_model (Optional[any], optional): a preloaded language-identification model to use instead of the default; if None, the fasttext lid.176.ftz model is downloaded (if needed) and loaded. Defaults to None.
        batch_size (int, optional): number of texts processed per batch during parallelization. Defaults to 100.
        max_workers (int, optional): number of workers for parallelization. Defaults to 8.
    Returns:
        dataset (pd.DataFrame): The same input dataset with a column 'language' added, containing the results of language detection.
    """
    assert "text_language_detect" in dataset.columns, (
        "you need to have a column text_language_detect to detect language"
    )

    if fasttext_model is None:
        if os.path.exists("lid.176.ftz"):
            print("Loading fasttext model from local file...")
            fasttext_model = fasttext.load_model("lid.176.ftz")
        else:
            print("Downloading fasttext model from website and saving locally...")
            r = requests.get(
                "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
            )
            with open("lid.176.ftz", "wb") as f:
                f.write(r.content)
            fasttext_model = fasttext.load_model("lid.176.ftz")
        print("Done.\n")

    def process_chunk_fasttext(text_chunk, threshold=0.5):
        preds = fasttext_model.predict(text_chunk.tolist(), k=1)
        preds = [
            lng[0][-2:] if score[0] > threshold else ""
            for lng, score in zip(preds[0], preds[1])
        ]
        return preds

    chunk_fasttext = thread_map(
        process_chunk_fasttext,
        grouper(dataset["text_language_detect"], batch_size),
        max_workers=max_workers,
        total=len(dataset) // batch_size,
    )

    dataset["language"] = np.concatenate(chunk_fasttext)
    return dataset
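
# Usage sketch for compute_language (assumes `df` was returned by prepare_dataset;
# downloads lid.176.ftz on first use when no model is passed):
#
#     df = compute_language(df)
#     # df["language"] holds the last two characters of the predicted fasttext label
#     # (e.g. "en", "fr"), or "" when the prediction confidence is below 0.5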


#############################
#### Compute Embeddings  ####
#############################


def download_USE(
    use_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3",
):
    use_model = hub.load(use_url)
    tf.saved_model.save(use_model, "use_model_kaggle")
    return use_model


@timeit
def compute_embeddings(df, batch_size: int = 100, max_workers: int = 8):
    """
    Compute embeddings for distance comparison.

    Args:
        df (pd.DataFrame): dataframe containing the column "text_to_embed" with the text to be embedded
        batch_size (int, optional): number of texts processed per batch during parallelization. Defaults to 100.
        max_workers (int, optional): number of workers for parallelization. Defaults to 8.
    Returns:
        dataset (pd.DataFrame): A dataframe with one column per embedding dimension, containing the computed embeddings.
    """
    assert "text_to_embed" in df.columns, (
        "You need to compute the text_to_embed column"
    )
    use_model = download_USE()

    def process_chunk_use(text_chunk):
        return pd.DataFrame(
            use_model(text_chunk).numpy(),
            index=text_chunk.index,
            columns=[f"USE:{i}" for i in range(512)],
        )

    chunk_use = thread_map(
        process_chunk_use,
        grouper(df["text_to_embed"], batch_size),
        max_workers=max_workers,
        total=len(df) // batch_size,
    )
    dataset = pd.concat([pd.concat(chunk_use, axis=0)], axis=1)
    dataset.index = df.index
    return dataset
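
# Usage sketch for compute_embeddings (downloads the multilingual USE model from
# TF Hub, which can take a while; `df` is assumed to come from prepare_dataset):
#
#     df_embeddings_use = compute_embeddings(df)
#     # one row per document, 512 columns named "USE:0" ... "USE:511"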


@timeit
def create_index_cosine(df_embeddings: pd.DataFrame):
    """
    Create an index with faiss for faster cosine distance computation.

    Args:
        df_embeddings (pd.DataFrame): dataframe containing the embeddings
    Returns:
        index: A faiss index which can be used to compute cosine distances more efficiently
    """
    embeddings = df_embeddings.to_numpy()
    ids = list(df_embeddings.index)

    # cosine similarity index...
    vector_dimension = embeddings.shape[1]
    index_flat = faiss.IndexFlat(vector_dimension, faiss.METRIC_INNER_PRODUCT)
    # ...encapsulated in another index in order to keep the posts ids
    index = faiss.IndexIDMap(index_flat)

    # for cosine similarity, the embeddings need to be L2-normalized
    try:
        faiss.normalize_L2(embeddings)
    except Exception:
        embeddings = embeddings.copy(order="C")
        faiss.normalize_L2(embeddings)
        print("C contiguous problem solved")

    # add embeddings & ids (faiss stores ids as int64, so the index values must be convertible to integers)
    index.add_with_ids(embeddings, ids)
    return index
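
# Note on create_index_cosine: with METRIC_INNER_PRODUCT and L2-normalized vectors,
# the inner product equals the cosine similarity, so the thresholds used with this
# index are cosine similarity thresholds. Usage sketch:
#
#     index_faiss = create_index_cosine(df_embeddings_use)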


@timeit
def find_matches(
    df_embeddings_search: pd.DataFrame,
    index,
    threshold: float = 0.7,
    batch_size: int = 100,
    verbose=True,
):
    """
    Compute pairwise cosine similarity between a subset of docs and all docs in the index.

    Args:
        df_embeddings_search (pd.DataFrame): dataframe containing the embeddings we want to match against the faiss index
        index: faiss index
        threshold (float, optional): similarity threshold. Defaults to 0.7.
        batch_size (int, optional): number of vectors per batch. Defaults to 100.
        verbose (bool, optional): display a progress bar. Defaults to True.
    Returns:
        matches (pd.DataFrame): A dataframe of pairs of duplicated texts with the associated cosine score.
    """
    list_indices = []
    for i_batch in trange(
        0, len(df_embeddings_search), batch_size, disable=not verbose
    ):
        limits, distances, indices = index.range_search(
            df_embeddings_search.iloc[i_batch : i_batch + batch_size].to_numpy(),
            thresh=threshold,
        )
        for lim in range(len(limits) - 1):
            source = df_embeddings_search.index[i_batch + lim]
            for target, score in zip(
                indices[limits[lim] : limits[lim + 1]],
                distances[limits[lim] : limits[lim + 1]],
            ):
                if str(target) != str(source):  # don't match a document with its own embedding
                    list_indices.append([str(source), str(target), score])

    # create the matches dataframe
    matches = pd.DataFrame(list_indices, columns=["source", "target", "score"])
    # drop duplicates because we have both A-B and B-A
    matches["duplicates"] = matches.apply(
        lambda row: str(min(row["source"], row["target"]))
        + "-"
        + str(max(row["source"], row["target"])),
        axis=1,
    )
    matches = matches.drop_duplicates("duplicates")
    return matches
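
# Usage sketch for find_matches (the embeddings passed here must be the same ones
# used to build the index; 0.715 is just an example threshold):
#
#     matches = find_matches(df_embeddings_use, index_faiss, threshold=0.715)
#     # matches has columns "source", "target", "score" (cosine similarity above the threshold)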


def similarity_levenshtein(pair):
    s1, s2 = pair
    assert min(len(s1), len(s2)) > 0, (
        "one text_grapheme is empty, so the Levenshtein similarity can't be computed"
    )
    return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))
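
# Worked example: levenshtein("kitten", "sitting") == 3 and the longer string has
# 7 characters, so similarity_levenshtein(("kitten", "sitting")) == 1 - 3 / 7 ≈ 0.571.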


@timeit
def compute_duplicate_types(
    matches: pd.DataFrame,
    threshold_grapheme=0.693,
    threshold_language=0.715,
    threshold_semantic=0.85,
):
    """
    Distinguish 3 different duplicate types: translation, rewording & copypasta.

    Args:
        matches (pd.DataFrame): dataframe of pairs of texts, containing text_grapheme_source and text_grapheme_target columns for detecting copypasta, and language_source and language_target columns for detecting translation
        threshold_grapheme (float, optional): threshold to distinguish copypasta from rewording using the Levenshtein similarity. Defaults to 0.693.
        threshold_language (float, optional): threshold to detect translation. Defaults to 0.715.
        threshold_semantic (float, optional): threshold to detect rewording. Defaults to 0.85.
    Returns:
        matches_strict (pd.DataFrame): dataframe containing 'copy-pasta', 'translation' and 'rewording' pairs of texts, with the associated score (cosine similarity from embeddings) and score_lev (Levenshtein-based similarity).
    """
    assert ("text_grapheme_source" in matches.columns) & (
        "text_grapheme_target" in matches.columns
    ), (
        "You need text_grapheme_source and text_grapheme_target columns in the dataframe for the Levenshtein similarity"
    )

    assert ("language_source" in matches.columns) & (
        "language_target" in matches.columns
    ), (
        "You need language_source and language_target columns in the dataframe to detect translations"
    )

    matches["dup_type"] = "rewording"
    matches.loc[
        matches["language_source"] != matches["language_target"], "dup_type"
    ] = "translation"

    matches.loc[matches.dup_type == "rewording", "score_lev"] = matches.loc[
        matches.dup_type == "rewording"
    ].apply(
        lambda x: similarity_levenshtein(
            (x["text_grapheme_source"], x["text_grapheme_target"])
        ),
        axis=1,
    )
    matches.loc[matches.score_lev > threshold_grapheme, "dup_type"] = "copy-pasta"

    matches_strict = matches[
        ((matches.score > threshold_semantic) & (matches.dup_type == "rewording"))
        | ((matches.score > threshold_language) & (matches.dup_type == "translation"))
        | (matches.dup_type == "copy-pasta")
    ]

    return matches_strict
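
# Decision rule implemented above, for each pair returned by faiss:
#   - different detected languages           -> "translation" (kept if cosine score > threshold_language)
#   - same language, high Levenshtein score  -> "copy-pasta"  (score_lev > threshold_grapheme, kept unconditionally)
#   - same language, lower Levenshtein score -> "rewording"   (kept if cosine score > threshold_semantic)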


def create_dataset_clusters(dataset: pd.DataFrame, edgelist: pd.DataFrame):
    """Assign a cluster of duplicated content to every document.

    The cluster is left empty (NaN) if no duplicated content was found for a document.

    Args:
        dataset (pd.DataFrame): dataframe containing each document, with the same index used to create the embeddings and the faiss index.
        edgelist (pd.DataFrame): dataframe of pairs of texts with their associated score
    Returns:
        df_cluster (pd.DataFrame): dataframe with one row per text and, when it exists, the cluster of duplicated content it belongs to.
    """
    df_cluster = dataset.copy()
    consolidated_edgelist = edgelist.groupby(["source", "target"], as_index=False)[
        "score"
    ].max()
    clusters = list(
        nx.connected_components(nx.from_pandas_edgelist(consolidated_edgelist))
    )
    # the largest clusters get the smallest cluster ids
    clusters.sort(key=len, reverse=True)
    for cluster_i, posts_indices in enumerate(clusters):
        df_cluster.loc[list(posts_indices), "cluster"] = cluster_i
    return df_cluster
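
# Usage sketch for create_dataset_clusters: clusters are the connected components of
# the duplicate graph, numbered from the largest to the smallest.
#
#     df_clusters = create_dataset_clusters(df, matches)
#     # df_clusters["cluster"] is 0 for the biggest cluster, NaN for unmatched documents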


def semantic_faiss(
    df: pd.DataFrame,
    min_size_txt: int = 30,
    df_embeddings_use: pd.DataFrame = None,
    embeddings_to_save: str = None,
    threshold_grapheme: float = 0.693,
    threshold_language: float = 0.715,
    threshold_semantic=0.85,
    remove_matches_same_user: str = None,
):
    """Apply the end-to-end 3 delta methodology with faiss.

    Args:
        df (pd.DataFrame): dataframe containing the following columns:
            - original: the original text
            - language (optional): language of each text. If not given, language detection is performed in order to detect translations
        min_size_txt (int): minimal text size required to apply the 3 delta method; shorter documents are removed.
        df_embeddings_use (pd.DataFrame): previously saved embeddings dataframe, to avoid recomputing the embeddings every time.
        embeddings_to_save (str): base name of the pickle file used to save the embeddings, if the user wants to persist them.
        threshold_grapheme (float): threshold to detect copypasta with the Levenshtein similarity on matches found with faiss. Defaults to 0.693.
        threshold_language (float): threshold to match 2 documents as a translation. Defaults to 0.715.
        threshold_semantic (float): threshold to match 2 documents as a rewording. Defaults to 0.85.
        remove_matches_same_user (str): name of a column identifying the author of each document; if provided, matches between two documents from the same author are discarded.
    Returns:
        matches (pd.DataFrame): dataframe containing the pairs of texts detected as duplicated content by the 3 delta method
        df_clusters (pd.DataFrame): the initial dataframe 'df' with, when it exists, the cluster of duplicated content each text belongs to.
    """

    df = prepare_dataset(df, min_size_txt=min_size_txt)

    if "language" not in df.columns:
        print("language detection")
        df = compute_language(df)

    if df_embeddings_use is None:
        df_embeddings_use = compute_embeddings(df)
        if embeddings_to_save is not None:
            df_embeddings_use.to_pickle(f"{embeddings_to_save}.pkl")

    index_faiss = create_index_cosine(df_embeddings_use)

    threshold_faiss = min(threshold_language, threshold_semantic)
    res = find_matches(df_embeddings_use, index_faiss, threshold=threshold_faiss)

    if remove_matches_same_user is not None:
        columns_join = [
            remove_matches_same_user,
            "language",
            "text_to_embed",
            "text_grapheme",
        ]
    else:
        columns_join = ["language", "text_to_embed", "text_grapheme"]
    matches = res.merge(
        df[columns_join].add_suffix("_source"),
        left_on="source",
        right_index=True,
        how="left",
    ).merge(
        df[columns_join].add_suffix("_target"),
        left_on="target",
        right_index=True,
        how="left",
    )
    matches = compute_duplicate_types(
        matches,
        threshold_grapheme=threshold_grapheme,
        threshold_language=threshold_language,
        threshold_semantic=threshold_semantic,
    )

    if remove_matches_same_user is not None:
        matches = matches[
            matches[remove_matches_same_user + "_source"]
            != matches[remove_matches_same_user + "_target"]
        ]

    df_clusters = create_dataset_clusters(df, matches)

    return matches, df_clusters
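
# End-to-end usage sketch for semantic_faiss (hypothetical data; the column name
# "user_id" is only an example for remove_matches_same_user):
#
#     df = pd.DataFrame(
#         {"original": list_of_texts, "user_id": list_of_authors},
#         index=list_of_unique_string_ids,
#     )
#     matches, df_clusters = semantic_faiss(
#         df,
#         min_size_txt=30,
#         embeddings_to_save="use_embeddings",
#         remove_matches_same_user="user_id",
#     )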
