Этот коммит содержится в:
Viginum-DataScientist-6 2025-05-26 11:52:38 +02:00
родитель c1d3767d57
Коммит 8c342ea5c1

Просмотреть файл

@@ -30,7 +30,7 @@ def timeit(func):
if total_time < 60: if total_time < 60:
print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec") print(f"<<< End {func.__name__}, Took: {total_time:.4f} sec")
else: else:
print(f"<<< End {func.__name__}, Took:{np.round((total_time)/60, 1)} min") print(f"<<< End {func.__name__}, Took:{np.round((total_time) / 60, 1)} min")
return result return result
return timeit_wrapper return timeit_wrapper
@@ -184,22 +184,22 @@ def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int =
Returns: Returns:
dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for 3 delta method. dataset (pd.DataFrame): The same input dataset with new columns added (text_grapheme, text_to_embed, text_language_detect), containing the preprocessed texts for 3 delta method.
""" """
assert isinstance( assert isinstance(dataset, (pd.Series, pd.DataFrame)), (
dataset, (pd.Series, pd.DataFrame) "dataset must be a pd.Series or a pd.DataFrame"
), "dataset must be a pd.Series or a pd.DataFrame" )
assert dataset.index.nunique() == len( assert dataset.index.nunique() == len(dataset), (
dataset "dataset must be indexed with unique indices"
), "dataset must be indexed with unique indices" )
assert all( assert all([isinstance(i, str) for i in dataset.index]), (
[isinstance(i, str) for i in dataset.index] "dataset indices must be `str`"
), "dataset indices must be `str`" )
if isinstance(dataset, pd.DataFrame): if isinstance(dataset, pd.DataFrame):
assert ( assert "original" in dataset.columns, (
"original" in dataset.columns "when dataset is a pd.DataFrame, it must have a column named 'original'"
), "when dataset is a pd.DataFrame, it must have a column named 'original'" )
if isinstance(dataset, pd.Series): if isinstance(dataset, pd.Series):
dataset = dataset.to_frame("original") dataset = dataset.to_frame("original")
@@ -260,7 +260,7 @@ def prepare_dataset(dataset: Union[pd.Series, pd.DataFrame], min_size_txt: int =
if min_size_txt is not None: if min_size_txt is not None:
print( print(
f'Removing {(dataset["text_grapheme"].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences...' f"Removing {(dataset['text_grapheme'].str.len() < min_size_txt).sum()} short texts over {len(dataset)} sentences..."
) )
dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt] dataset = dataset.loc[dataset["text_grapheme"].str.len() >= min_size_txt]
print("Done.") print("Done.")
@@ -285,9 +285,9 @@ def compute_language(
Returns: Returns:
dataset (pd.DataFrame): The same input dataset with column 'language' added containing the results of language detection. dataset (pd.DataFrame): The same input dataset with column 'language' added containing the results of language detection.
""" """
assert ( assert "text_language_detect" in dataset.columns, (
"text_language_detect" in dataset.columns "you need to have a column text_language_detect to detect language"
), "you need to have a column text_language_detect to detect language" )
if fasttext_model is None: if fasttext_model is None:
if os.path.exists("lid.176.ftz"): if os.path.exists("lid.176.ftz"):
@@ -452,9 +452,9 @@ def find_matches(
def similarity_levenshtein(pair): def similarity_levenshtein(pair):
s1, s2 = pair s1, s2 = pair
assert ( assert min(len(s1), len(s2)) > 0, (
min(len(s1), len(s2)) > 0 "one text_grapheme is None and levenshtein can't be retrieved"
), "one text_grapheme is None and levenshtein can't be retrieved" )
return 1 - levenshtein(s1, s2) / max(len(s1), len(s2)) return 1 - levenshtein(s1, s2) / max(len(s1), len(s2))