diff --git a/d3lta/faissd3lta.py b/d3lta/faissd3lta.py
index b3f0f28..d044cac 100644
--- a/d3lta/faissd3lta.py
+++ b/d3lta/faissd3lta.py
@@ -90,16 +90,16 @@ def remove_symbols(text):
     return SYMBOL_REGEX.sub(r'', text)
 
 def preprocess_text(
-    s,
-    lower=True,
-    remove_accents=True,
-    remove_urls=True,
-    remove_mentions=True,
-    remove_emojis=True,
-    remove_hashtags_frontend=False,
-    remove_twitter_cropend=False,
-    replace_newline_characters=True,
-    remove_punctuation=False,
+    s: str | list[str] | set[str] | frozenset[str] | pd.Series,
+    lower: bool = True,
+    remove_accents: bool = True,
+    remove_urls: bool = True,
+    remove_mentions: bool = True,
+    remove_emojis: bool = True,
+    remove_hashtags_frontend: bool = False,
+    remove_twitter_cropend: bool = False,
+    replace_newline_characters: bool = True,
+    remove_punctuation: bool = False,
 ):
     """
     clean a list-like of strings, performing all the following treatments by default
diff --git a/tests/faissd3lta_test.py b/tests/faissd3lta_test.py
index 4ef4e59..99f2f1c 100644
--- a/tests/faissd3lta_test.py
+++ b/tests/faissd3lta_test.py
@@ -24,13 +24,13 @@ def examples_dataset():
     ]
 
 
-def test_compute_language(examples_dataset):
+def test_compute_language(examples_dataset: list[str]):
     df_language = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
     df_language = compute_language(df_language)
     assert list(df_language["language"]) == ["fr", "fr", "fr", "en", "en", "fr"]
 
 
-def test_embedding_similarity(examples_dataset):
+def test_embedding_similarity(examples_dataset: list[str]):
     df_test = pd.DataFrame(
         examples_dataset,
         columns=["text_to_embed"],
@@ -52,7 +52,7 @@
     )
 
 
-def test_semantic_faiss(examples_dataset):
+def test_semantic_faiss(examples_dataset: list[str]):
     df = pd.DataFrame(examples_dataset, columns=["text_language_detect"])
     df = compute_language(df)
     df_emb = compute_embeddings(
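
Note: a minimal usage sketch of the newly annotated preprocess_text signature, assuming the function is importable as d3lta.faissd3lta.preprocess_text; the sample inputs are hypothetical and only exercise the parameter types and defaults declared in the diff above.

# Illustrative only -- not part of the diff.
import pandas as pd

from d3lta.faissd3lta import preprocess_text

# Per the new annotation, s may be a str, a list/set/frozenset of str, or a pd.Series.
raw = pd.Series(
    [
        "Bonjour @user, regardez https://example.com 😀",
        "An English sentence with #hashtags and    extra whitespace",
    ]
)

cleaned = preprocess_text(
    raw,
    lower=True,               # default per the signature
    remove_urls=True,         # default per the signature
    remove_mentions=True,     # default per the signature
    remove_punctuation=True,  # non-default keyword from the annotated signature
)
print(cleaned)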