Этот коммит содержится в:
Viginum-DataScientist-6 2025-05-26 12:05:01 +02:00
родитель b8b6e3940f
Коммит 9d178deb74
2 изменённых файла: 13 добавлений и 13 удалений

Просмотреть файл

@ -90,16 +90,16 @@ def remove_symbols(text):
return SYMBOL_REGEX.sub(r'', text)
def preprocess_text(
s,
lower=True,
remove_accents=True,
remove_urls=True,
remove_mentions=True,
remove_emojis=True,
remove_hashtags_frontend=False,
remove_twitter_cropend=False,
replace_newline_characters=True,
remove_punctuation=False,
s: str | list[str] | set[str] | frozenset[str] | pd.Series,
lower: bool = True,
remove_accents: bool = True,
remove_urls: bool = True,
remove_mentions: bool = True,
remove_emojis: bool = True,
remove_hashtags_frontend: bool = False,
remove_twitter_cropend: bool = False,
replace_newline_characters: bool = True,
remove_punctuation: bool = False,
):
"""
clean a list-like of strings, performing all the following treatments by default

Просмотреть файл

@ -24,13 +24,13 @@ def examples_dataset():
]
def test_compute_language(examples_dataset):
def test_compute_language(examples_dataset: list[str]):
df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df_language = compute_language(df_language)
assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]
def test_embedding_similarity(examples_dataset):
def test_embedding_similarity(examples_dataset: list[str]):
df_test = pd.DataFrame(
examples_dataset,
columns=["text_to_embed"],
@ -52,7 +52,7 @@ def test_embedding_similarity(examples_dataset):
)
def test_semantic_faiss(examples_dataset):
def test_semantic_faiss(examples_dataset: list[str]):
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
df = compute_language(df)
df_emb = compute_embeddings(