fix: add type annotations

2025-10-29 05:04:20 +02:00 · 2025-05-26 12:05:01 +02:00 · 2025-05-26 12:05:01 +02:00 · 9d178deb74
--- a/d3lta/faissd3lta.py
+++ b/d3lta/faissd3lta.py
@@ -90,16 +90,16 @@ def remove_symbols(text):
    return SYMBOL_REGEX.sub(r'', text)

 def preprocess_text(
-    s,
-    lower=True,
-    remove_accents=True,
-    remove_urls=True,
-    remove_mentions=True,
-    remove_emojis=True,
-    remove_hashtags_frontend=False,
-    remove_twitter_cropend=False,
-    replace_newline_characters=True,
-    remove_punctuation=False,
+    s: str | list[str] | set[str] | frozenset[str] | pd.Series,
+    lower: bool = True,
+    remove_accents: bool = True,
+    remove_urls: bool = True,
+    remove_mentions: bool = True,
+    remove_emojis: bool = True,
+    remove_hashtags_frontend: bool = False,
+    remove_twitter_cropend: bool = False,
+    replace_newline_characters: bool = True,
+    remove_punctuation: bool = False,
 ):
    """
    clean a list-like of strings, performing all the following treatments by default
--- a/tests/faissd3lta_test.py
+++ b/tests/faissd3lta_test.py
@@ -24,13 +24,13 @@ def examples_dataset():
    ]


-def test_compute_language(examples_dataset):
+def test_compute_language(examples_dataset: list[str]):
    df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
    df_language = compute_language(df_language)
    assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]


-def test_embedding_similarity(examples_dataset):
+def test_embedding_similarity(examples_dataset: list[str]):
    df_test = pd.DataFrame(
        examples_dataset,
        columns=["text_to_embed"],
@@ -52,7 +52,7 @@ def test_embedding_similarity(examples_dataset):
    )


-def test_semantic_faiss(examples_dataset):
+def test_semantic_faiss(examples_dataset: list[str]):
    df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
    df = compute_language(df)
    df_emb = compute_embeddings(