зеркало из
https://github.com/VIGINUM-FR/D3lta.git
synced 2025-10-29 05:04:20 +02:00
fix: add type annotations
Этот коммит содержится в:
родитель
b8b6e3940f
Коммит
9d178deb74
@ -90,16 +90,16 @@ def remove_symbols(text):
|
||||
return SYMBOL_REGEX.sub(r'', text)
|
||||
|
||||
def preprocess_text(
|
||||
s,
|
||||
lower=True,
|
||||
remove_accents=True,
|
||||
remove_urls=True,
|
||||
remove_mentions=True,
|
||||
remove_emojis=True,
|
||||
remove_hashtags_frontend=False,
|
||||
remove_twitter_cropend=False,
|
||||
replace_newline_characters=True,
|
||||
remove_punctuation=False,
|
||||
s: str | list[str] | set[str] | frozenset[str] | pd.Series,
|
||||
lower: bool = True,
|
||||
remove_accents: bool = True,
|
||||
remove_urls: bool = True,
|
||||
remove_mentions: bool = True,
|
||||
remove_emojis: bool = True,
|
||||
remove_hashtags_frontend: bool = False,
|
||||
remove_twitter_cropend: bool = False,
|
||||
replace_newline_characters: bool = True,
|
||||
remove_punctuation: bool = False,
|
||||
):
|
||||
"""
|
||||
clean a list-like of strings, performing all the following treatments by default
|
||||
|
||||
@ -24,13 +24,13 @@ def examples_dataset():
|
||||
]
|
||||
|
||||
|
||||
def test_compute_language(examples_dataset):
|
||||
def test_compute_language(examples_dataset: list[str]):
|
||||
df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
|
||||
df_language = compute_language(df_language)
|
||||
assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]
|
||||
|
||||
|
||||
def test_embedding_similarity(examples_dataset):
|
||||
def test_embedding_similarity(examples_dataset: list[str]):
|
||||
df_test = pd.DataFrame(
|
||||
examples_dataset,
|
||||
columns=["text_to_embed"],
|
||||
@ -52,7 +52,7 @@ def test_embedding_similarity(examples_dataset):
|
||||
)
|
||||
|
||||
|
||||
def test_semantic_faiss(examples_dataset):
|
||||
def test_semantic_faiss(examples_dataset: list[str]):
|
||||
df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
|
||||
df = compute_language(df)
|
||||
df_emb = compute_embeddings(
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user