Этот коммит содержится в:
MathisHammel 2025-04-16 23:58:49 +02:00 коммит произвёл Viginum-DataScientist-6
родитель 80b81896e8
Коммит e641fb8f50
2 изменённых файлов: 42 добавлений и 4 удалений

Просмотреть файл

@ -3,7 +3,6 @@ import os
import re
import time
from typing import Union
import demoji
import faiss
import fasttext
from gensim.utils import deaccent
@ -47,6 +46,46 @@ def grouper(iterable, n):
#### Preprocessing Dataset ####
###############################
# Character class used to strip emojis and other pictographic symbols.
# Each entry is a Unicode block (or single code point); a run of one or more
# matching characters is matched as a whole so it can be removed in one pass.
# NOTE(review): the General Punctuation range also matches typographic
# punctuation (curly quotes, en/em dashes, the horizontal ellipsis) -- confirm
# that stripping those together with emojis is intended.
SYMBOL_REGEX = re.compile(
    "["
    "\U00002000-\U0000206F"  # General Punctuation (includes ZWJ U+200D)
    "\U000020E3"  # Combining Enclosing Keycap (tail of keycap emoji sequences)
    "\U00002190-\U000021FF"  # Arrows
    "\U00002300-\U000023FF"  # Miscellaneous Technical
    "\U00002400-\U0000243F"  # Control Pictures
    "\U00002440-\U0000245F"  # Optical Character Recognition
    "\U00002460-\U000024FF"  # Enclosed Alphanumerics (whole block, no gap)
    "\U00002500-\U0000257F"  # Box Drawing
    "\U00002580-\U0000259F"  # Block Elements
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002700-\U000027BF"  # Dingbats
    "\U000027C0-\U000027EF"  # Miscellaneous Mathematical Symbols-A
    "\U000027F0-\U000027FF"  # Supplemental Arrows-A
    "\U00002800-\U000028FF"  # Braille Patterns
    "\U00002900-\U0000297F"  # Supplemental Arrows-B
    "\U00002980-\U000029FF"  # Miscellaneous Mathematical Symbols-B
    "\U00002A00-\U00002AFF"  # Supplemental Mathematical Operators
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003200-\U000032FF"  # Enclosed CJK Letters and Months (whole block, incl. U+3297/U+3299)
    "\U0001F000-\U0001F02F"  # Mahjong Tiles
    "\U0001F030-\U0001F09F"  # Domino Tiles
    "\U0001F0A0-\U0001F0FF"  # Playing Cards
    "\U0001F100-\U0001F1FF"  # Enclosed Alphanumeric Supplement (incl. regional indicators used by flags)
    "\U0001F200-\U0001F2FF"  # Enclosed Ideographic Supplement
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F650-\U0001F67F"  # Ornamental Dingbats
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001FB00-\U0001FBFF"  # Symbols for Legacy Computing
    "\U0000200D"  # Zero Width Joiner (already in General Punctuation; kept explicit)
    "\U0000FE0F"  # Variation Selector-16 (emoji style)
    "\U0000FE0E"  # Variation Selector-15 (text style)
    "\U000E0020-\U000E007F"  # Tag characters (subdivision flag sequences)
    "]+"
)
def remove_symbols(text):
    """Strip every character matched by ``SYMBOL_REGEX`` from ``text``.

    Args:
        text (str): string to clean.

    Returns:
        str: ``text`` with each run of matching characters replaced by the
            empty string.
    """
    stripped = SYMBOL_REGEX.sub("", text)
    return stripped
def preprocess_text(
s,
@ -68,7 +107,7 @@ def preprocess_text(
remove_accents (bool, optional): deaccent the text. Defaults to True.
remove_urls (bool, optional): remove urls from the text. Defaults to True.
remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis from the text. Defaults to True.
remove_emojis (bool, optional): remove emojis and other pictograms from the text. Defaults to True.
remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
remove_twitter_cropend (bool, optional): remove Twitter-added "" character at the end of messages that are too long. Defaults to False.
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
@ -104,7 +143,7 @@ def preprocess_text(
for msg in s
]
if remove_emojis:
s = [demoji.replace(msg, "").strip() for msg in s]
s = [remove_symbols(msg).strip() for msg in s]
if remove_hashtags_frontend:
if (not remove_urls) or (not remove_mentions):

Просмотреть файл

@ -7,7 +7,6 @@ authors = ["Viginum"]
[tool.poetry.dependencies]
python = "^3.10"
demoji = "1.1.0"
faiss-cpu = "1.9.0.post1"
fasttext = "0.9.3"
gensim = "4.3.3"