import os
import re
import time
from functools import wraps
from typing import Union

import faiss
import fasttext
import networkx as nx
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
import tensorflow_hub as hub

# importing `tensorflow_text` ensures that some ops required by the USE model are available at runtime
import tensorflow_text  # noqa: F401 # pylint: disable=unused-import
from gensim.utils import deaccent
from polyleven import levenshtein
from tqdm.auto import trange
from tqdm.contrib.concurrent import thread_map

from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover


def timeit(func):
    """Decorator printing the wall-clock duration of the wrapped function."""

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.time()
        print(f">>> Start {func.__name__}")
        result = func(*args, **kwargs)
        end_time = time.time()
        total_time = end_time - start_time
        if total_time < 60:
            print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
        else:
            print(f"<<< End {func.__name__}, Took: {np.round(total_time / 60, 1)} min")
        return result

    return timeit_wrapper


def grouper(iterable, n):
    """Lazily yield successive chunks of size `n` from `iterable` (which must support `len()` and slicing)."""
    for i in range(0, len(iterable), n):
        yield iterable[i : i + n]


###############################
#### Preprocessing Dataset ####
###############################


def preprocess_text(
    s: str | list[str] | set[str] | frozenset[str] | pd.Series,
    lower: bool = True,
    remove_accents: bool = True,
    remove_urls: bool = True,
    remove_mentions: bool = True,
    remove_emojis: bool = True,
    remove_hashtags_frontend: bool = False,
    remove_twitter_cropend: bool = False,
    replace_newline_characters: bool = True,
    remove_punctuation: bool = False,
    emojis_remover: EmojisRemover | None = None,
):
    r"""
    Clean a string or a list-like of strings, applying the following treatments (all enabled by default unless stated otherwise).

    Args:
        s (str or list-like of strings): input string or list-like of strings
        lower (bool, optional): lowercase the text. Defaults to True.
        remove_accents (bool, optional): deaccent the text. Defaults to True.
        remove_urls (bool, optional): remove urls from the text. Defaults to True.
        remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
        remove_emojis (bool, optional): remove emojis and other pictograms from the text. Defaults to True.
        remove_hashtags_frontend (bool, optional): remove leading and trailing hashtags from the text. Defaults to False.
        remove_twitter_cropend (bool, optional): remove the "…" character that Twitter appends to messages that are too long. Defaults to False.
        replace_newline_characters (bool, optional): replace the two commonly found escape characters \r and \n with '. '. Defaults to True.
        remove_punctuation (bool, optional): remove punctuation from the text; beware that this also removes the '#' of hashtags. Defaults to False.
        emojis_remover (EmojisRemover, optional):
            if provided, overrides the default engine used for emojis matching and removal.
            Has no effect if `remove_emojis` is set to False.

    Returns:
        str or list of str: the cleaned string (if `s` is a string), otherwise a list of cleaned strings.
    """
    if s is None:
        s = ""

    if emojis_remover is None:
        emojis_remover = ExplicitUnicodeBlocksEmojisRemover()

    assert isinstance(s, (str, list, pd.Series, set, frozenset))

    if isinstance(s, str):
        encapsulated = True
        s = [s]
    else:
        encapsulated = False
    if lower:
        s = [msg.lower() for msg in s]
    if remove_accents:
        s = [deaccent(msg) for msg in s]
    if remove_urls:
        match_url_regexp = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
        s = [re.sub(match_url_regexp, "", msg, flags=re.MULTILINE).strip() for msg in s]
    if remove_mentions:
        match_mentions_regexp = r"(@[a-zA-Z0-9_]+)"
        s = [
            re.sub(match_mentions_regexp, "", msg, flags=re.MULTILINE).strip()
            for msg in s
        ]
    if remove_twitter_cropend:
        match_croppedmsg_regexp = r"([^\s]+…)$"
        s = [
            re.sub(match_croppedmsg_regexp, "", msg, flags=re.MULTILINE).strip()
            for msg in s
        ]
    if remove_emojis:
        s = [emojis_remover.remove_symbols(msg).strip() for msg in s]

    if remove_hashtags_frontend:
        if (not remove_urls) or (not remove_mentions):
            print(
                "Not all leading and trailing hashtags might be removed because there might be mentions or urls"
            )
        match_hashtags_begin = r"(#\S+ ?)+"
        match_hashtags_end = r"(\S+# ?)+"
        match_hashtags_frontend = f"^({match_hashtags_begin})|^({match_hashtags_end})|({match_hashtags_begin})$|({match_hashtags_end})$"
        s = [re.sub(match_hashtags_frontend, "", msg).strip() for msg in s]
    if replace_newline_characters:
        match_escapes_regexp = r"(\n|\r)+"
        s = [
            re.sub(
                r"\s+", " ", re.sub(match_escapes_regexp, ". ", msg, flags=re.MULTILINE)
            ).strip()
            for msg in s
        ]
    if remove_punctuation:
        match_punctuations = r"[^\w\s]"
        s = [
            re.sub(r"\s+", " ", re.sub(match_punctuations, " ", msg)).strip()
            for msg in s
        ]
    if encapsulated:
        return s[0].strip()
    else:
        return s
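

# Example usage (illustrative sketch, not executed at import time):
#
#     cleaned = preprocess_text("Bonjour @user https://example.com 😀 #test")
#
# With the default options the URL, the mention and the emoji are stripped and whitespace is
# collapsed, yielding roughly "bonjour #test" (the exact emoji handling depends on the
# `EmojisRemover` implementation in use). Passing a list or a pd.Series returns a list of
# cleaned strings instead of a single string.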


@timeit
def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int | None = 30):
    """
    Create new columns of preprocessed texts from the original text, for distance comparison with the 3 delta method.

    Args:
        dataset (Union[pd.Series, pd.DataFrame]): dataframe or series containing a column "original" with the text. Optional: a column "language" can be given, otherwise language detection is performed.
        min_size_txt (Optional[int], optional): minimum length of `text_grapheme`; shorter texts are dropped from duplicate-content detection because they are too small. If set to None, no text is removed. Defaults to 30.

    Returns:
        dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for the 3 delta method.
    """
    assert isinstance(dataset, (pd.Series, pd.DataFrame)), (
        "dataset must be a pd.Series or a pd.DataFrame"
    )

    assert dataset.index.nunique() == len(dataset), (
        "dataset must be indexed with unique indices"
    )

    assert all([isinstance(i, str) for i in dataset.index]), (
        "dataset indices must be `str`"
    )

    if isinstance(dataset, pd.DataFrame):
        assert "original" in dataset.columns, (
            "when dataset is a pd.DataFrame, it must have a column named 'original'"
        )

    if isinstance(dataset, pd.Series):
        dataset = dataset.to_frame("original")

    # text_grapheme is used for grapheme distance (Levenshtein)
    # this is the cleanest version with no spaces
    if "text_grapheme" not in dataset.columns:
        dataset["text_grapheme"] = [
            t.replace(" ", "")
            for t in preprocess_text(
                dataset["original"],
                lower=True,
                remove_accents=True,
                remove_urls=True,
                remove_mentions=True,
                remove_emojis=True,
                remove_hashtags_frontend=True,
                remove_twitter_cropend=False,
                replace_newline_characters=True,
                remove_punctuation=True,
            )
        ]

    # text_to_embed is used for semantic distance and embedded with USE
    # links are removed
    if "text_to_embed" not in dataset.columns:
        dataset["text_to_embed"] = preprocess_text(
            dataset["original"],
            lower=False,
            remove_accents=False,
            remove_urls=True,
            remove_mentions=True,
            remove_emojis=False,
            remove_hashtags_frontend=False,
            remove_twitter_cropend=False,
            replace_newline_characters=False,
            remove_punctuation=False,
        )
    # text_language_detect is used for fasttext
    # accents are kept as they provide useful cues to the language
    if ("language" not in dataset.columns) or (
        "text_language_detect" not in dataset.columns
    ):
        dataset["text_language_detect"] = preprocess_text(
            dataset["original"],
            lower=False,
            remove_accents=False,
            remove_urls=True,
            remove_mentions=True,
            remove_emojis=True,
            remove_hashtags_frontend=True,
            remove_twitter_cropend=False,
            replace_newline_characters=True,
            remove_punctuation=False,
        )
    print("Done.")
    print("")

    if min_size_txt is not None:
        print(
            f"Removing {(dataset['text_grapheme'].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences..."
        )
        dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
        print("Done.")

    return dataset
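

# Example usage (illustrative sketch): `prepare_dataset` expects unique string indices and an
# "original" column; it adds the three preprocessed text columns used by the 3 delta method.
#
#     df = pd.DataFrame(
#         {
#             "original": [
#                 "First sample message, long enough to be kept for duplicate detection.",
#                 "Second sample message, also long enough to be kept for duplicate detection.",
#             ]
#         },
#         index=["0", "1"],
#     )
#     df = prepare_dataset(df, min_size_txt=30)
#     # df now also contains "text_grapheme", "text_to_embed" and "text_language_detect"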


@timeit
def compute_language(
    dataset: pd.DataFrame,
    fasttext_model=None,
    batch_size: int = 100,
    max_workers: int = 8,
):
    """
    Compute language detection, in order to later detect translations.

    Args:
        dataset (pd.DataFrame): dataframe containing the column "text_language_detect" with the text to be analyzed
        fasttext_model (Optional[any], optional): a preloaded language-identification model; if None, the fasttext lid.176.ftz model is loaded from disk or downloaded. Defaults to None.
        batch_size (int, optional): number of texts processed per batch during parallelization. Defaults to 100.
        max_workers (int, optional): number of workers for parallelization. Defaults to 8.

    Returns:
        dataset (pd.DataFrame): The same input dataset with a 'language' column added, containing the results of language detection.
    """
    assert "text_language_detect" in dataset.columns, (
        "you need to have a column text_language_detect to detect language"
    )

    if fasttext_model is None:
        if os.path.exists("lid.176.ftz"):
            print("Loading fasttext model from local file...")
            fasttext_model = fasttext.load_model("lid.176.ftz")
        else:
            print("Downloading fasttext model from website and saving locally...")
            r = requests.get(
                "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
            )
            with open("lid.176.ftz", "wb") as f:
                f.write(r.content)
            fasttext_model = fasttext.load_model("lid.176.ftz")
        print("Done.\n")

    def process_chunk_fasttext(text_chunk, threshold=0.5):
        # keep the 2-letter suffix of fasttext's "__label__xx" prediction, or "" below the confidence threshold
        preds = fasttext_model.predict(text_chunk.tolist(), k=1)
        preds = [
            lng[0][-2:] if score[0] > threshold else ""
            for lng, score in zip(preds[0], preds[1])
        ]
        return preds

    chunk_fasttext = thread_map(
        process_chunk_fasttext,
        grouper(dataset["text_language_detect"], batch_size),
        max_workers=max_workers,
        total=len(dataset) // batch_size,
    )

    dataset["language"] = np.concatenate(chunk_fasttext)
    return dataset
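

# Example usage (illustrative sketch): the first call downloads fasttext's lid.176.ftz model
# (about 1 MB) into the working directory; a preloaded model can also be passed explicitly.
#
#     df = compute_language(df)  # df must contain "text_language_detect"
#     # df["language"] now holds two-letter codes such as "en" or "fr", or "" when the
#     # prediction confidence is below 0.5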


#############################
#### Compute Embeddings ####
#############################


def download_USE(
    use_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3",
):
    """Load the multilingual Universal Sentence Encoder from TF-Hub and save a local copy."""
    use_model = hub.load(use_url)
    tf.saved_model.save(use_model, "use_model_kaggle")
    return use_model


@timeit
def compute_embeddings(df, batch_size: int = 100, max_workers: int = 8):
    """
    Compute embeddings for distance comparison.

    Args:
        df (pd.DataFrame): dataframe containing the column "text_to_embed" with the text to be embedded
        batch_size (int, optional): number of texts processed per batch during parallelization. Defaults to 100.
        max_workers (int, optional): number of workers for parallelization. Defaults to 8.

    Returns:
        dataset (pd.DataFrame): a dataframe of 512-dimensional USE embeddings (columns "USE:0" ... "USE:511"), indexed like `df`.
    """
    assert "text_to_embed" in df.columns, (
        "You need to compute the text_to_embed column"
    )
    use_model = download_USE()

    def process_chunk_use(text_chunk):
        return pd.DataFrame(
            use_model(text_chunk).numpy(),
            index=text_chunk.index,
            columns=[f"USE:{i}" for i in range(512)],
        )

    chunk_use = thread_map(
        process_chunk_use,
        grouper(df["text_to_embed"], batch_size),
        max_workers=max_workers,
        total=len(df) // batch_size,
    )
    dataset = pd.concat(chunk_use, axis=0)
    dataset.index = df.index
    return dataset
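

# Example usage (illustrative sketch): the first call downloads the multilingual Universal
# Sentence Encoder from TF-Hub (several hundred MB) and saves a local copy.
#
#     df_embeddings = compute_embeddings(df)  # df must contain "text_to_embed"
#     # df_embeddings has shape (len(df), 512), with columns "USE:0" ... "USE:511"
#     df_embeddings.to_pickle("embeddings.pkl")  # optional: reuse later via semantic_faiss(..., df_embeddings_use=...)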


@timeit
def create_index_cosine(df_embeddings: pd.DataFrame):
    """
    Create a faiss index for faster cosine similarity computation.

    Args:
        df_embeddings (pd.DataFrame): dataframe containing the embeddings

    Returns:
        index: A faiss index which can be used to compute cosine similarities more efficiently
    """
    embeddings = df_embeddings.to_numpy()
    ids = list(df_embeddings.index)

    # cosine similarity index...
    vector_dimension = embeddings.shape[1]
    index_flat = faiss.IndexFlat(vector_dimension, faiss.METRIC_INNER_PRODUCT)
    # ...encapsulated in another index in order to have posts ids
    index = faiss.IndexIDMap(index_flat)

    # for cosine similarity, the vectors need to be L2-normalised
    try:
        faiss.normalize_L2(embeddings)
    except Exception:
        # faiss requires C-contiguous arrays; copy before normalising
        embeddings = embeddings.copy(order="C")
        faiss.normalize_L2(embeddings)
        print("C contiguous problem solved")

    # add embeddings & ids
    index.add_with_ids(embeddings, ids)
    return index


@timeit
def find_matches(
    df_embeddings_search: pd.DataFrame,
    index,
    threshold: float = 0.7,
    batch_size: int = 100,
    verbose=True,
):
    """
    Compute pairwise cosine similarity between a subset of documents and all documents in the faiss index.

    Args:
        df_embeddings_search (pd.DataFrame): dataframe containing the embeddings we want to match against the faiss index
        index: faiss index
        threshold (float, optional): similarity threshold below which pairs are discarded. Defaults to 0.7.
        batch_size (int, optional): number of vectors per batch. Defaults to 100.
        verbose (bool, optional): display a progress bar. Defaults to True.

    Returns:
        matches (pd.DataFrame): A dataframe of pairs of duplicated texts with the associated cosine score.
    """
    list_indices = []
    for i_batch in trange(
        0, len(df_embeddings_search), batch_size, disable=not verbose
    ):
        limits, distances, indices = index.range_search(
            df_embeddings_search.iloc[i_batch : i_batch + batch_size].to_numpy(),
            thresh=threshold,
        )
        for lim in range(len(limits) - 1):
            source = df_embeddings_search.index[i_batch + lim]
            for target, score in zip(
                indices[limits[lim] : limits[lim + 1]],
                distances[limits[lim] : limits[lim + 1]],
            ):
                if str(target) != str(source):  # don't match a document with its own embedding
                    list_indices.append([str(source), str(target), score])

    # create matches dataframe
    matches = pd.DataFrame(list_indices, columns=["source", "target", "score"])
    # drop duplicates because we have A-B and B-A
    matches["duplicates"] = matches.apply(
        lambda row: str(min(row["source"], row["target"]))
        + "-"
        + str(max(row["source"], row["target"])),
        axis=1,
    )
    matches = matches.drop_duplicates("duplicates")
    return matches
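

# Example usage (illustrative sketch): build the index from the embeddings, then search it with
# the same embeddings to recover candidate duplicate pairs above the similarity threshold.
#
#     index = create_index_cosine(df_embeddings)
#     matches = find_matches(df_embeddings, index, threshold=0.7)
#     # matches has columns "source", "target", "score" and "duplicates"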


def similarity_levenshtein(pair):
    """Return a normalised similarity in [0, 1] between the two strings of `pair`, based on the Levenshtein distance."""
    s1, s2 = pair
    assert min(len(s1), len(s2)) > 0, (
        "one text_grapheme is empty and the levenshtein similarity can't be computed"
    )
    return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))
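

# Worked example: levenshtein("kitten", "sitting") == 3 and the longer string has length 7,
# so similarity_levenshtein(("kitten", "sitting")) == 1 - 3 / 7 ≈ 0.571.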


@timeit
def compute_duplicate_types(
    matches: pd.DataFrame,
    threshold_grapheme=0.693,
    threshold_language=0.715,
    threshold_semantic=0.85,
):
    """
    Distinguish 3 different duplicate types: translation, rewording & copy-pasta.

    Args:
        matches (pd.DataFrame): dataframe of pairs of texts containing text_grapheme_source and text_grapheme_target columns for detecting copy-pasta, and language_source and language_target columns for detecting translation
        threshold_grapheme (float, optional): threshold to distinguish copy-pasta from rewording using the levenshtein similarity. Defaults to 0.693.
        threshold_language (float, optional): threshold to detect translation. Defaults to 0.715.
        threshold_semantic (float, optional): threshold to detect rewording. Defaults to 0.85.

    Returns:
        matches_strict (pd.DataFrame): dataframe containing 'copy-pasta', 'translation' and 'rewording' pairs of texts, with the associated score (cosine similarity from embeddings) and score_lev (similarity computed from the levenshtein distance).
    """
    assert ("text_grapheme_source" in matches.columns) & (
        "text_grapheme_target" in matches.columns
    ), (
        "You need text_grapheme_source and text_grapheme_target columns in the dataframe for the Levenshtein similarity"
    )

    assert ("language_source" in matches.columns) & (
        "language_target" in matches.columns
    ), (
        "You need language_source and language_target columns in the dataframe to detect translations"
    )

    matches["dup_type"] = "rewording"
    matches.loc[
        matches["language_source"] != matches["language_target"], "dup_type"
    ] = "translation"

    matches.loc[matches.dup_type == "rewording", "score_lev"] = matches.loc[
        matches.dup_type == "rewording"
    ].apply(
        lambda x: similarity_levenshtein(
            (x["text_grapheme_source"], x["text_grapheme_target"])
        ),
        axis=1,
    )
    matches.loc[matches.score_lev > threshold_grapheme, "dup_type"] = "copy-pasta"

    matches_strict = matches[
        ((matches.score > threshold_semantic) & (matches.dup_type == "rewording"))
        | ((matches.score > threshold_language) & (matches.dup_type == "translation"))
        | (matches.dup_type == "copy-pasta")
    ]

    return matches_strict
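

# Example usage (illustrative sketch): `matches` is typically the output of `find_matches`
# merged with the per-document columns, as done in `semantic_faiss`:
#
#     matches_strict = compute_duplicate_types(matches)
#     # matches_strict["dup_type"] takes values in {"copy-pasta", "translation", "rewording"}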


def create_dataset_clusters(dataset: pd.DataFrame, edgelist: pd.DataFrame):
    """Assign a cluster of duplicated content to each document.

    The 'cluster' value is left as NaN when no duplicated content was found for a document.

    Args:
        dataset (pd.DataFrame): dataframe containing each document, with the same index used to create the embeddings and the faiss index.
        edgelist (pd.DataFrame): dataframe corresponding to pairs of texts and their associated score
    Return:
        df_cluster (pd.DataFrame): dataframe with one row per text and, when it exists, the cluster of duplicated content it belongs to.
    """
    df_cluster = dataset.copy()
    consolidated_edgelist = edgelist.groupby(["source", "target"], as_index=False)[
        "score"
    ].max()
    clusters = list(
        nx.connected_components(nx.from_pandas_edgelist(consolidated_edgelist))
    )
    # label the largest clusters first
    clusters.sort(key=len, reverse=True)
    for cluster_i, posts_indices in enumerate(clusters):
        df_cluster.loc[list(posts_indices), "cluster"] = cluster_i
    return df_cluster


def semantic_faiss(
    df: pd.DataFrame,
    min_size_txt: int = 30,
    df_embeddings_use: pd.DataFrame = None,
    embeddings_to_save: str = None,
    threshold_grapheme: float = 0.693,
    threshold_language: float = 0.715,
    threshold_semantic: float = 0.85,
    remove_matches_same_user: str = None,
):
    """Apply the end-to-end 3 delta methodology with faiss.

    Args:
        df (pd.DataFrame): dataframe containing the columns:
            - original: the original text
            - language (optional): language of each text. If not given, language detection is computed in order to detect translation
        min_size_txt (int): minimal text size required to apply 3 delta; shorter documents are removed.
        df_embeddings_use (pd.DataFrame): embeddings dataframe already saved, in order not to recompute the embeddings every time.
        embeddings_to_save (str): name of the pickle file used to save the embeddings, if the user wants to save them.
        threshold_grapheme (float): threshold to detect copy-pasta with levenshtein on matches found with faiss. Defaults to 0.693.
        threshold_language (float): threshold to find matches between 2 documents for translation. Defaults to 0.715.
        threshold_semantic (float): threshold to find matches between 2 documents for rewording. Defaults to 0.85.
        remove_matches_same_user (str): name of a column identifying the author of each document; if provided, matches between two documents from the same author are discarded. Defaults to None.
    Return:
        matches (pd.DataFrame): dataframe containing pairs of texts detected as duplicated content by 3 delta
        df_clusters (pd.DataFrame): initial dataframe 'df' with, when it exists, the cluster of duplicated content each text belongs to.
    """

    df = prepare_dataset(df, min_size_txt=min_size_txt)

    if "language" not in df.columns:
        print("language detection")
        df = compute_language(df)

    if df_embeddings_use is None:
        df_embeddings_use = compute_embeddings(df)
        if embeddings_to_save is not None:
            df_embeddings_use.to_pickle(f"{embeddings_to_save}.pkl")

    index_faiss = create_index_cosine(df_embeddings_use)

    threshold_faiss = min(threshold_language, threshold_semantic)
    res = find_matches(df_embeddings_use, index_faiss, threshold=threshold_faiss)

    if remove_matches_same_user is not None:
        columns_join = [
            remove_matches_same_user,
            "language",
            "text_to_embed",
            "text_grapheme",
        ]
    else:
        columns_join = ["language", "text_to_embed", "text_grapheme"]
    matches = res.merge(
        df[columns_join].add_suffix("_source"),
        left_on="source",
        right_index=True,
        how="left",
    ).merge(
        df[columns_join].add_suffix("_target"),
        left_on="target",
        right_index=True,
        how="left",
    )
    matches = compute_duplicate_types(
        matches,
        threshold_grapheme=threshold_grapheme,
        threshold_language=threshold_language,
        threshold_semantic=threshold_semantic,
    )

    if remove_matches_same_user is not None:
        matches = matches[
            matches[remove_matches_same_user + "_source"]
            != matches[remove_matches_same_user + "_target"]
        ]

    df_clusters = create_dataset_clusters(df, matches)

    return matches, df_clusters
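

# Minimal end-to-end usage sketch (illustrative; the sample texts and the "user" column below are
# made up). Running it downloads the fasttext language-identification model and the multilingual
# USE model on first use. Note that the indices are unique strings of digits, so they can also be
# reused as faiss integer ids.
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            "original": [
                "The quick brown fox jumps over the lazy dog near the river bank.",
                "The quick brown fox jumps over the lazy dog near the river bank!",
                "A completely unrelated sentence about cooking pasta for dinner tonight.",
            ],
            "user": ["alice", "bob", "alice"],
        },
        index=["0", "1", "2"],
    )
    matches, clusters = semantic_faiss(sample, remove_matches_same_user="user")
    print(matches[["source", "target", "score", "dup_type"]])
    print(clusters[["original", "cluster"]])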