зеркало из
				https://github.com/VIGINUM-FR/D3lta.git
				synced 2025-10-31 14:06:10 +02:00 
			
		
		
		
	refactor: extract emojis_remover with dependency injection
- enables testing and benchmarking different implementations
Этот коммит содержится в:
		
							родитель
							
								
									9d178deb74
								
							
						
					
					
						Коммит
						ad5e63da8b
					
				
							
								
								
									
										90
									
								
								d3lta/emojis_remover.py
									
									
									
									
									
										Обычный файл
									
								
							
							
						
						
									
										90
									
								
								d3lta/emojis_remover.py
									
									
									
									
									
										Обычный файл
									
								
							| @ -0,0 +1,90 @@ | |||||||
|  | import re | ||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from typing import final | ||||||
|  | 
 | ||||||
|  | import demoji | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
@dataclass
class EmojisRemover(ABC):
    """Base class for interchangeable emoji/symbol removal strategies.

    Concrete subclasses provide the matching logic by implementing
    ``_remove_symbols_implementation``. The public entry point,
    ``remove_symbols``, runs that implementation and then, unless disabled,
    a whitespace clean-up pass.

    Attributes:
        skip_postprocessing: When True, ``remove_symbols`` returns the
            implementation's output untouched (no whitespace clean-up and
            no stripping).
    """

    skip_postprocessing: bool = False

    # The two names below carry no annotation, so ``@dataclass`` leaves them
    # as plain class attributes instead of turning them into fields.
    _whitespace_or_newline_capturing_group_name = "whitespace_or_newline"
    # A run of spaces followed by one more space or a newline; only that
    # final character is captured so it can be kept by the substitution.
    _repeated_whitespace_pattern = re.compile(
        rf"[ ]+(?P<{_whitespace_or_newline_capturing_group_name}> |\n)"
    )

    @final
    def remove_symbols(self, text: str) -> str:
        """Return ``text`` with symbols removed by the concrete strategy.

        Args:
            text: The string to clean.

        Returns:
            The implementation's output, passed through ``_postprocess``
            unless ``skip_postprocessing`` is set.
        """
        cleaned = self._remove_symbols_implementation(text)
        return cleaned if self.skip_postprocessing else self._postprocess(cleaned)

    @abstractmethod
    def _remove_symbols_implementation(self, text: str) -> str:
        """Remove the symbols from ``text``; supplied by each subclass."""
        ...

    def _postprocess(self, text: str) -> str:
        """Collapse the space runs left behind by removal, then strip both ends."""
        return self._remove_repeated_whitespace(text).strip()

    def _remove_repeated_whitespace(self, text: str) -> str:
        """Replace each space run that precedes a space or newline with that character.

        Runs of spaces therefore shrink to a single space, and spaces that
        sit directly before a newline disappear.
        """
        keep_captured_character = (
            rf"\g<{self._whitespace_or_newline_capturing_group_name}>"
        )
        return self._repeated_whitespace_pattern.sub(keep_captured_character, text)
|  | 
 | ||||||
|  | 
 | ||||||
class ExplicitUnicodeBlocksEmojisRemover(EmojisRemover):
    """Remove symbols by matching a hard-coded list of Unicode code point ranges.

    Every maximal run of characters falling in one of the ranges listed in
    ``SYMBOLS_REGEX`` is deleted. Note that the ranges are broader than
    emojis alone: the General Punctuation range, for instance, also contains
    typographic dashes, curly quotes and the ellipsis character.
    """

    # Character class of the code point ranges to delete. The block names in
    # the comments follow the Unicode block list.
    SYMBOLS_REGEX = re.compile(
        "["
        "\U00002000-\U0000206f"  # General Punctuation
        "\U00002190-\U000021ff"  # Arrows
        "\U00002300-\U000023ff"  # Miscellaneous Technical
        "\U00002400-\U0000243f"  # Control Pictures
        "\U00002440-\U0000245f"  # Optical Character Recognition
        "\U00002460-\U0000249f"  # Enclosed Alphanumerics (first part)
        "\U000024b0-\U000024ff"  # Enclosed Alphanumerics (rest; U+24A0-U+24AF not covered)
        "\U00002500-\U0000257f"  # Box Drawing
        "\U00002580-\U000025ff"  # Block Elements + Geometric Shapes
        "\U00002600-\U000026ff"  # Miscellaneous Symbols
        "\U00002700-\U000027bf"  # Dingbats
        "\U000027c0-\U000027ef"  # Miscellaneous Mathematical Symbols-A
        "\U000027f0-\U000027ff"  # Supplemental Arrows-A
        "\U00002800-\U000028ff"  # Braille Patterns
        "\U00002900-\U0000297f"  # Supplemental Arrows-B
        "\U00002980-\U000029ff"  # Miscellaneous Mathematical Symbols-B
        "\U00002a00-\U00002aff"  # Supplemental Mathematical Operators
        "\U00002b00-\U00002bff"  # Miscellaneous Symbols and Arrows
        "\U00003200-\U0000325f"  # Enclosed CJK Letters and Months (first part)
        "\U0001f300-\U0001f5ff"  # Miscellaneous Symbols and Pictographs
        "\U0001f600-\U0001f64f"  # Emoticons
        "\U0001f680-\U0001f6ff"  # Transport and Map Symbols
        "\U0001f700-\U0001f77f"  # Alchemical Symbols
        "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
        "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
        "\U0001fa00-\U0001fa6f"  # Chess Symbols
        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
        "\U0001fb00-\U0001fbff"  # Symbols for Legacy Computing
        "\U0000200d"  # Zero Width Joiner (already inside the General Punctuation range)
        "\U0000fe0f"  # Variation Selector-16 (emoji style)
        "\U0000fe0e"  # Variation Selector-15 (text style)
        "]+"
    )

    def _remove_symbols_implementation(self, text: str) -> str:
        """Delete every run of characters matched by ``SYMBOLS_REGEX``."""
        symbols = self.SYMBOLS_REGEX
        return symbols.sub("", text)
|  | 
 | ||||||
|  | 
 | ||||||
class DemojiEmojisRemover(EmojisRemover):
    """Remove emojis by delegating to the third-party ``demoji`` package."""

    def _remove_symbols_implementation(self, text: str) -> str:
        """Return the result of ``demoji.replace`` called with ``text`` only.

        No replacement string is passed, so ``demoji``'s own default applies.
        """
        without_emojis = demoji.replace(text)
        return without_emojis
| @ -20,6 +20,8 @@ from polyleven import levenshtein | |||||||
| from tqdm.auto import trange | from tqdm.auto import trange | ||||||
| from tqdm.contrib.concurrent import thread_map | from tqdm.contrib.concurrent import thread_map | ||||||
| 
 | 
 | ||||||
|  | from d3lta.emojis_remover import EmojisRemover, ExplicitUnicodeBlocksEmojisRemover | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def timeit(func): | def timeit(func): | ||||||
|     @wraps(func) |     @wraps(func) | ||||||
| @ -48,46 +50,6 @@ def grouper(iterable, n): | |||||||
| #### Preprocessing Dataset #### | #### Preprocessing Dataset #### | ||||||
| ############################### | ############################### | ||||||
| 
 | 
 | ||||||
| # Unicode ranges for most emojis |  | ||||||
| SYMBOL_REGEX = re.compile( |  | ||||||
|     "[" |  | ||||||
|     "\U00002000-\U0000206F"  # General Punctuation |  | ||||||
|     "\U00002190-\U000021FF"  # Arrows |  | ||||||
|     "\U00002300-\U000023FF"  # Miscellaneous Technical |  | ||||||
|     "\U00002400-\U0000243F"  # Control Pictures |  | ||||||
|     "\U00002440-\U0000245F"  # Optical Character Recognition |  | ||||||
|     "\U00002460-\U0000249F"  # Enclosed Alphanumerics |  | ||||||
|     "\U000024B0-\U000024FF"  # Enclosed Alphanumerics Extension  |  | ||||||
|     "\U00002500-\U0000257F"  # Box Drawing |  | ||||||
|     "\U00002580-\U000025FF"  # Block Elements |  | ||||||
|     "\U00002600-\U000026FF"  # Miscellaneous Symbols |  | ||||||
|     "\U00002700-\U000027BF"  # Dingbats |  | ||||||
|     "\U000027C0-\U000027EF"  # Miscellaneous Mathematical Symbols-A |  | ||||||
|     "\U000027F0-\U000027FF"  # Supplemental Arrows-A |  | ||||||
|     "\U00002800-\U000028FF"  # Braille Patterns |  | ||||||
|     "\U00002900-\U0000297F"  # Supplemental Arrows-B |  | ||||||
|     "\U00002980-\U000029FF"  # Miscellaneous Mathematical Symbols-B |  | ||||||
|     "\U00002A00-\U00002AFF"  # Supplemental Mathematical Operators |  | ||||||
|     "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows |  | ||||||
|     "\U00003200-\U0000325F"  # Enclosed CJK Letters and Months |  | ||||||
|     "\U0001F300-\U0001F5FF"  # symbols & pictographs |  | ||||||
|     "\U0001F600-\U0001F64F"  # emoticons |  | ||||||
|     "\U0001F680-\U0001F6FF"  # transport & map symbols |  | ||||||
|     "\U0001F700-\U0001F77F"  # alchemical symbols |  | ||||||
|     "\U0001F780-\U0001F7FF"  # Geometric Shapes |  | ||||||
|     "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C |  | ||||||
|     "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs |  | ||||||
|     "\U0001FA00-\U0001FA6F"  # Chess Symbols |  | ||||||
|     "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A |  | ||||||
|     "\U0001FB00-\U0001FBFF"  # Symbols for Legacy Computing |  | ||||||
|     "\U0000200D"              # Zero Width Joiner (ZWJ) |  | ||||||
|     "\U0000FE0F"              # Variation Selector-16 (emoji style) |  | ||||||
|     "\U0000FE0E"              # Variation Selector-15 (text style) |  | ||||||
|     "]+" |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| def remove_symbols(text): |  | ||||||
|     return SYMBOL_REGEX.sub(r'', text) |  | ||||||
| 
 | 
 | ||||||
| def preprocess_text( | def preprocess_text( | ||||||
|     s: str | list[str] | set[str] | frozenset[str] | pd.Series, |     s: str | list[str] | set[str] | frozenset[str] | pd.Series, | ||||||
| @ -100,6 +62,7 @@ def preprocess_text( | |||||||
|     remove_twitter_cropend: bool = False, |     remove_twitter_cropend: bool = False, | ||||||
|     replace_newline_characters: bool = True, |     replace_newline_characters: bool = True, | ||||||
|     remove_punctuation: bool = False, |     remove_punctuation: bool = False, | ||||||
|  |     emojis_remover: EmojisRemover | None = None, | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     clean a list-like of strings, performing all the following treatments by default |     clean a list-like of strings, performing all the following treatments by default | ||||||
| @ -114,10 +77,16 @@ def preprocess_text( | |||||||
|         remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False. |         remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False. | ||||||
|         replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True. |         replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True. | ||||||
|         remove_punctuation (bool, optional): remove punctuation from the text, be careful, it will remove # of hashtags too. Defaults to False. |         remove_punctuation (bool, optional): remove punctuation from the text, be careful, it will remove # of hashtags too. Defaults to False. | ||||||
|  |         emojis_remover (EmojisRemover, optional): | ||||||
|  |             if provided, overrides the default engine used for emojis matching and removal. | ||||||
|  |             Has no effect if `remove_emojis` is set to False. | ||||||
|     """ |     """ | ||||||
|     if s is None: |     if s is None: | ||||||
|         s = "" |         s = "" | ||||||
| 
 | 
 | ||||||
|  |     if emojis_remover is None: | ||||||
|  |         emojis_remover = ExplicitUnicodeBlocksEmojisRemover() | ||||||
|  | 
 | ||||||
|     assert isinstance(s, (str, list, pd.Series, set, frozenset)) |     assert isinstance(s, (str, list, pd.Series, set, frozenset)) | ||||||
| 
 | 
 | ||||||
|     if isinstance(s, str): |     if isinstance(s, str): | ||||||
| @ -145,7 +114,7 @@ def preprocess_text( | |||||||
|             for msg in s |             for msg in s | ||||||
|         ] |         ] | ||||||
|     if remove_emojis: |     if remove_emojis: | ||||||
|         s = [remove_symbols(msg).strip() for msg in s] |         s = [emojis_remover.remove_symbols(msg).strip() for msg in s] | ||||||
| 
 | 
 | ||||||
|     if remove_hashtags_frontend: |     if remove_hashtags_frontend: | ||||||
|         if (not remove_urls) or (not remove_mentions): |         if (not remove_urls) or (not remove_mentions): | ||||||
|  | |||||||
		Загрузка…
	
	
			
			x
			
			
		
	
		Ссылка в новой задаче
	
	Block a user
	 Viginum-DataScientist-6
						Viginum-DataScientist-6