зеркало из
https://github.com/VIGINUM-FR/D3lta.git
synced 2025-10-29 21:16:20 +02:00
Added fast emoji support
Этот коммит содержится в:
родитель
80b81896e8
Коммит
e641fb8f50
@ -3,7 +3,6 @@ import os
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from typing import Union
|
from typing import Union
|
||||||
import demoji
|
|
||||||
import faiss
|
import faiss
|
||||||
import fasttext
|
import fasttext
|
||||||
from gensim.utils import deaccent
|
from gensim.utils import deaccent
|
||||||
@ -47,6 +46,46 @@ def grouper(iterable, n):
|
|||||||
#### Preprocessing Dataset ####
|
#### Preprocessing Dataset ####
|
||||||
###############################
|
###############################
|
||||||
|
|
||||||
|
# Unicode ranges covering emojis and other pictographic symbols.
#
# The pattern is a single character class, so one C-level regex pass strips
# every listed code point. Besides the pictograph blocks themselves it must
# also contain the "glue" code points that emoji sequences are built from
# (joiner, variation selectors, keycap mark, tag characters); otherwise
# removing the visible base character would leave invisible residue behind.
#
# NOTE: the General Punctuation range also strips typographic punctuation
# such as curly quotes, en/em dashes and the ellipsis (U+2026).
SYMBOL_REGEX = re.compile(
    "["
    "\U00002000-\U0000206F"  # General Punctuation (includes ZWJ, U+200D)
    "\U000020E3"  # Combining Enclosing Keycap (keycap emoji sequences)
    "\U00002190-\U000021FF"  # Arrows
    "\U00002300-\U000023FF"  # Miscellaneous Technical
    "\U00002400-\U0000243F"  # Control Pictures
    "\U00002440-\U0000245F"  # Optical Character Recognition
    "\U00002460-\U000024FF"  # Enclosed Alphanumerics (whole block, no gap)
    "\U00002500-\U0000257F"  # Box Drawing
    "\U00002580-\U000025FF"  # Block Elements + Geometric Shapes
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002700-\U000027BF"  # Dingbats
    "\U000027C0-\U000027EF"  # Miscellaneous Mathematical Symbols-A
    "\U000027F0-\U000027FF"  # Supplemental Arrows-A
    "\U00002800-\U000028FF"  # Braille Patterns
    "\U00002900-\U0000297F"  # Supplemental Arrows-B
    "\U00002980-\U000029FF"  # Miscellaneous Mathematical Symbols-B
    "\U00002A00-\U00002AFF"  # Supplemental Mathematical Operators
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003200-\U0000325F"  # Enclosed CJK Letters and Months (first part)
    "\U0001F000-\U0001F02F"  # Mahjong Tiles
    "\U0001F030-\U0001F09F"  # Domino Tiles
    "\U0001F0A0-\U0001F0FF"  # Playing Cards
    "\U0001F100-\U0001F1FF"  # Enclosed Alphanumeric Supplement (incl. flags)
    "\U0001F200-\U0001F2FF"  # Enclosed Ideographic Supplement
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001FB00-\U0001FBFF"  # Symbols for Legacy Computing
    "\U0000FE0F"  # Variation Selector-16 (emoji style)
    "\U0000FE0E"  # Variation Selector-15 (text style)
    "\U000E0020-\U000E007F"  # Tag characters (subdivision flag sequences)
    "]+"
)
|
||||||
|
|
||||||
|
def remove_symbols(text):
    """Return ``text`` with every run of characters matched by SYMBOL_REGEX deleted.

    Args:
        text (str): the string to clean.

    Returns:
        str: ``text`` with each match replaced by the empty string;
            surrounding whitespace is left untouched.
    """
    cleaned = SYMBOL_REGEX.sub(r'', text)
    return cleaned
|
||||||
|
|
||||||
def preprocess_text(
|
def preprocess_text(
|
||||||
s,
|
s,
|
||||||
@ -68,7 +107,7 @@ def preprocess_text(
|
|||||||
remove_accents (bool, optional): deaccent the text. Defaults to True.
|
remove_accents (bool, optional): deaccent the text. Defaults to True.
|
||||||
remove_urls (bool, optional): remove urls from the text. Defaults to True.
|
remove_urls (bool, optional): remove urls from the text. Defaults to True.
|
||||||
remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
|
remove_mentions (bool, optional): remove mentions from the text. Defaults to True.
|
||||||
remove_emojis (bool, optional): remove emojis from the text. Defaults to True.
|
remove_emojis (bool, optional): remove emojis and other pictograms from the text. Defaults to True.
|
||||||
remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
|
remove_hashtags_frontend (bool, optional): remove leading and ending hashtags from the text. Defaults to False.
|
||||||
remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False.
|
remove_twitter_cropend (bool, optional): remove Twitter-added "…" character at the end of messages that are too long. Defaults to False.
|
||||||
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
|
replace_newline_characters (bool, optional): replace two commonly found escape characters: \r and \n with '. '. Defaults to True.
|
||||||
@ -104,7 +143,7 @@ def preprocess_text(
|
|||||||
for msg in s
|
for msg in s
|
||||||
]
|
]
|
||||||
if remove_emojis:
|
if remove_emojis:
|
||||||
s = [demoji.replace(msg, "").strip() for msg in s]
|
s = [remove_symbols(msg).strip() for msg in s]
|
||||||
|
|
||||||
if remove_hashtags_frontend:
|
if remove_hashtags_frontend:
|
||||||
if (not remove_urls) or (not remove_mentions):
|
if (not remove_urls) or (not remove_mentions):
|
||||||
|
|||||||
@ -7,7 +7,6 @@ authors = ["Viginum"]
|
|||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
demoji = "1.1.0"
|
|
||||||
faiss-cpu = "1.9.0.post1"
|
faiss-cpu = "1.9.0.post1"
|
||||||
fasttext = "0.9.3"
|
fasttext = "0.9.3"
|
||||||
gensim = "4.3.3"
|
gensim = "4.3.3"
|
||||||
|
|||||||
Загрузка…
x
Ссылка в новой задаче
Block a user