Mirror of
https://github.com/ssciwr/AMMICO.git
synced 2025-10-30 13:36:04 +02:00
Topic analysis (#53)
* add bertopic to requirements
* basic topic modeling
* topic modeling using BERT; bugfix if no text on post
* update for google colab
* Catch connection errors
* replace newline character with space
* move topic analysis into PostprocessText class
* set up dataflow topic analysis
* expose topic model to UI
* tests for class init
* tests for topic analysis
* more tests
* take care of carriage return on windows
* take care of carriage return on windows
* take care of carriage return on windows
* set encoding to ensure windows compatibility
* track encoding error
* more debug
* skip topic analysis debug
* windows fixes
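For orientation, a minimal sketch of the workflow this commit introduces, pieced together from the notebook and test changes further down; the csv path is only a placeholder for your own output file.

# Sketch of the topic-analysis workflow added in this commit; names are taken
# from the diff below, the csv path is a placeholder.
import misinformation.text as tt

# Option 1: post-process a dict that already holds "text_english" entries
# topic_model, topic_df, most_frequent_topics = tt.PostprocessText(mydict=mydict).analyse_topic()

# Option 2: read previously exported results back from a csv file
topic_model, topic_df, most_frequent_topics = tt.PostprocessText(
    use_csv=True, csv_path="./data_out.csv"
).analyse_topic(return_topics=3)

print(topic_df)  # topic overview, most frequent first; topic -1 collects outliers
for topic in most_frequent_topics:
    print("Topic:", topic)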
This commit is contained in:
Parent 0a017b10ec
Commit a6578cfdf3
52 misinformation/test/data/test_data_out.csv Normal file
@@ -0,0 +1,52 @@
+,filename,text,text_language,text_english
+0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
+für Ingenieure und Naturwissenschaftler
+Mit zahlreichen Abbildungen und Rechenbeispielen
+und einer ausführlichen Integraltafel
+3., verbesserte Auflage"
+1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+ballenger stor goin
+gdĐOL, SIVI 23 TL 02
+de in obl
+och yd badalang
+a
+Ber
+ook Sy-RW enot go baldus",om,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+balloons big goin
+gdĐOL, SIVI 23 TL
+there in obl
+och yd change
+a
+Ber
+ook Sy-RW isn't going anywhere"
+2,./test/data/IMG_3757.jpg,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON",en,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON"
52 misinformation/test/data/test_data_out_nokey.csv Normal file
@@ -0,0 +1,52 @@
+,filename,text,text_language,text_nglish
+0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
+für Ingenieure und Naturwissenschaftler
+Mit zahlreichen Abbildungen und Rechenbeispielen
+und einer ausführlichen Integraltafel
+3., verbesserte Auflage"
+1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+ballenger stor goin
+gdĐOL, SIVI 23 TL 02
+de in obl
+och yd badalang
+a
+Ber
+ook Sy-RW enot go baldus",om,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+balloons big goin
+gdĐOL, SIVI 23 TL
+there in obl
+och yd change
+a
+Ber
+ook Sy-RW isn't going anywhere"
+2,./test/data/IMG_3757.jpg,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON",en,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON"
190 misinformation/test/data/topic_analysis_test.csv Normal file
@@ -0,0 +1,190 @@
+text_english
+Mercury: Retrograde
+Pathology
+Symbiote
+ProductOfDrugs (Prod. The Virus and Antidote)
+Venom
+Gatteka
+kamikaze (+ pulse)
+T.R.U. (Totally Rotten Underground)
+I Put My Dick in Your Mental
+Andromeda
+BRAINFOOD
+Troll Under the Bridge
+1000 Rounds
+Sacrifice
+Backpack
+D(R)Own
+"Okay
+TakingOutTheTrash
+Io sono qui
+Paris
+Murder
+High 'N Mighty
+Euronymous
+Hades
+Nails
+Squeeze
+No Teeth
+Bang Ya Fucking Head
+BLUE JUICE
+Loch Ness
+Hold Uh
+Bone Saw
+Coffin Wave
+OhNo!
+TheArtOfCremation
+OakGroveRoad
+WhatWasThat
+FunnyToSeeYouHere
+John Dee
+Kybalion
+Killer
+608
+Eternal Dreams
+Nightmare Choir (I Been Asleep Too Long)
+Exodus
+Vengeance
+Claustrophobia
+Rearranged
+Paralax
+Exsanguination
+Mutiny
+Centipede
+Грустная сука
+This World Is Sick
+Пламя
+2:45
+who is he
+Sleeping
+Timeless
+Pound for Pound
+Finger Trembling
+Overload
+Kill Yourself (Part III)
+2nd Hand
+Antarctica
+Memoirs Of A Gorilla
+Runnin' Thru The 7th With My Woadies
+Mount Sinai
+FUCKTHEPOPULATION
+Magazine
+2 Hot 4 U (feat. $Uicdeboy$)
+O Pana!
+LTE
+Champion Of Death
+Seppuku (feat. Suicideboy$ & Jgrxxn)
+You're Now Tuning Into 66.6 FM With DJ Rapture (The Hottest Hour Of The Evening)
+Slip On A Banana Clip
+A Death In The Ocean Would Be Beautiful
+Shattered Amethyst
+Goosebumps
+Venom
+Bury Me
+Hack Slash
+2000 Rounds
+Sea Sick
+Grain
+"Beware
+Kali Yuga
+Hexada
+Caligula
+Niagara (feat. Lil Peep)
+Scrying Through Shattered Glass
+Polaris
+Rapture
+Blackmage
+Tartarus
+Until the Light Takes Us
+As Above so Look out Below
+Swan
+Sneak Diss (feat. So6ix)
+Plague Doctor Mask
+Some of Us May Never See the World
+Filth
+Homecoming
+Blood
+Sweat
+Tears
+Anabolic
+HDMI
+Dirt
+Oxygen
+Branches
+CtrlAltDelete
+BlastZone (ЗонаПоражения)
+CharacterSelect (ВыборПерсонажа)
+RestInPeace (Prod. by The Virus And Antidote)
+BlackMold
+Toxin
+Electric
+Cranium
+Friday
+Hooky
+Kalaxian Crystals
+Slurp
+BROKE ft. Prohibeo
+Lies
+Terry McGinnis
+Gremlin
+Giant Squit
+You Are Not Like Us
+Arachnids
+Give Ah Fuck
+Death Wish
+Allergies
+Cut Throat
+Memoirs of a Gorilla
+Benz Truck (гелик)
+Norf Norf
+Dat $tick
+"RAF (feat. A$AP Rocky
+Crazy
+Still Cold / Pathway Private
+The Chills
+Slip on a Banana Clip
+Lights
+Akina Speed Star
+Big Fish
+The Bodies Fall Just Like the Leaves
+Story: No Title
+P.S Fuck You Cunt (feat. Lil Peep)
+Torch
+"Buff Squad (feat. Pouya
+Sarcophagus III (feat. $Uicideboy$)
+Virginia Tech
+Lte
+Fuckthepopulation
+Gloss of Blood
+100K
+Dark Light
+"But Wait
+Great Influence
+It Don't Matter
+absolute in doubt
+Boss
+Look at Me Now
+Bulletproof
+Contraband
+Deira City Centre
+Kyoto
+Pull Out Game
+Bird Is The Word
+Life Is Short
+Here We Go Again
+Bloodshed
+Wassup Bro!
+ACT 2 - BirthOfTheSpaceGod
+Grey Tee
+Sleeping Bag
+Afterlife
+King Cobra (Drippin')
+Heart Attack
+Chain$Aw
+"King
+P.T.S.D
+Brand New
+Jukai
+Philosopher's Throne
+PRBLMS
+Back At It
This file cannot be displayed because it contains an unexpected character in line 88 and column 1.
@@ -2,6 +2,8 @@ import os
 import pytest
 import spacy
 import misinformation.text as tt
+import misinformation
+import pandas as pd
 
 TESTDICT = {
     "IMG_3755": {
@@ -29,7 +31,6 @@ def test_TextDetector():
     assert test_obj.subdict["text_language"] is None
     assert test_obj.subdict["text_english"] is None
     assert not test_obj.analyse_text
-    assert not test_obj.analyse_topic
 
 
 @pytest.mark.gcv
@@ -39,7 +40,6 @@ def test_analyse_image():
         test_obj.analyse_image()
         test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True)
         test_obj.analyse_image()
-        test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True)
 
 
 @pytest.mark.gcv
@@ -68,6 +68,15 @@ def test_translate_text():
     assert test_obj.subdict["text_english"] == translated_text
 
 
+def test_remove_linebreaks():
+    test_obj = tt.TextDetector({})
+    test_obj.subdict["text"] = "This is \n a test."
+    test_obj.subdict["text_english"] = "This is \n another\n test."
+    test_obj.remove_linebreaks()
+    assert test_obj.subdict["text"] == "This is a test."
+    assert test_obj.subdict["text_english"] == "This is another test."
+
+
 def test_run_spacy():
     test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
     ref_file = "./test/data/text_IMG_3755.txt"
@@ -106,3 +115,34 @@ def test_sentiment_analysis():
     test_obj.sentiment_analysis()
     assert test_obj.subdict["polarity"] == 0.5
     assert test_obj.subdict["subjectivity"] == 0.6
+
+
+def test_PostprocessText():
+    reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON"
+    reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"
+    obj = tt.PostprocessText(mydict=TESTDICT)
+    # make sure test works on windows where end-of-line character is \r\n
+    test_dict = obj.list_text_english[2].replace("\r", "")
+    assert test_dict == reference_dict
+    for key in TESTDICT.keys():
+        TESTDICT[key].pop("text_english")
+    with pytest.raises(ValueError):
+        tt.PostprocessText(mydict=TESTDICT)
+    obj = tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out.csv")
+    # make sure test works on windows where end-of-line character is \r\n
+    test_df = obj.list_text_english[0].replace("\r", "")
+    assert test_df == reference_df
+    with pytest.raises(ValueError):
+        tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out_nokey.csv")
+    with pytest.raises(ValueError):
+        tt.PostprocessText()
+
+
+def test_analyse_topic():
+    _, topic_df, most_frequent_topics = tt.PostprocessText(
+        use_csv=True, csv_path="./test/data/topic_analysis_test.csv"
+    ).analyse_topic()
+    # since this is not deterministic we cannot be sure we get the same result twice
+    assert len(topic_df) == 2
+    assert topic_df["Name"].iloc[0] == "0_the_feat_of_is"
+    assert most_frequent_topics[0][0][0] == "the"
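The nested indexing in the last assertion reflects how BERTopic reports a topic; a short annotated sketch of the structure the new test relies on (the weights shown are placeholders, not real output):

# Structure behind `most_frequent_topics[0][0][0] == "the"` in the new test:
# each entry is topic_model.get_topic(i), i.e. a list of (word, weight) pairs,
# ordered from most to least representative word. Weights here are placeholders.
most_frequent_topics = [
    [("the", 0.12), ("feat", 0.09), ("of", 0.07), ("is", 0.06)],  # topic 0
    [("you", 0.11), ("me", 0.08)],                                # topic 1
]
assert most_frequent_topics[0][0][0] == "the"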
@@ -6,6 +6,9 @@ from textblob import TextBlob
 from textblob import download_corpora
 import io
 from misinformation import utils
+import grpc
+import pandas as pd
+from bertopic import BERTopic
 
 # make widgets work again
 # clean text has weird spaces and separation of "do n't"
@@ -13,14 +16,11 @@ from misinformation import utils
 
 
 class TextDetector(utils.AnalysisMethod):
-    def __init__(
-        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
-    ) -> None:
+    def __init__(self, subdict: dict, analyse_text: bool = False) -> None:
         super().__init__(subdict)
         self.subdict.update(self.set_keys())
         self.translator = Translator()
         self.analyse_text = analyse_text
-        self.analyse_topic = analyse_topic
         if self.analyse_text:
             self._initialize_spacy()
             self._initialize_textblob()
@@ -46,13 +46,12 @@ class TextDetector(utils.AnalysisMethod):
     def analyse_image(self):
         self.get_text_from_image()
         self.translate_text()
+        self.remove_linebreaks()
         if self.analyse_text:
             self._run_spacy()
             self.clean_text()
             self.correct_spelling()
             self.sentiment_analysis()
-        if self.analyse_topic:
-            self.analyse_topic()
         return self.subdict
 
     def get_text_from_image(self):
@@ -62,12 +61,19 @@ class TextDetector(utils.AnalysisMethod):
         with io.open(path, "rb") as image_file:
             content = image_file.read()
         image = vision.Image(content=content)
+        # check for usual connection errors and retry if necessary
+        try:
             response = client.text_detection(image=image)
+        except grpc.RpcError as exc:
+            print("Cloud vision API connection failed")
+            print("Skipping this image ..{}".format(path))
+            print("Connection failed with code {}: {}".format(exc.code(), exc))
+        # here check if text was found on image
+        if response:
             texts = response.text_annotations[0].description
-        # here check if text was found
-        if texts:
             self.subdict["text"] = texts
         if response.error.message:
+            print("Google Cloud Vision Error")
             raise ValueError(
                 "{}\nFor more info on error messages, check: "
                 "https://cloud.google.com/apis/design/errors".format(
@@ -80,6 +86,14 @@ class TextDetector(utils.AnalysisMethod):
         self.subdict["text_language"] = translated.src
         self.subdict["text_english"] = translated.text
 
+    def remove_linebreaks(self):
+        """Remove linebreaks from original and translated text."""
+        if self.subdict["text"]:
+            self.subdict["text"] = self.subdict["text"].replace("\n", " ")
+            self.subdict["text_english"] = self.subdict["text_english"].replace(
+                "\n", " "
+            )
+
     def _run_spacy(self):
         """Generate spacy doc object."""
         self.doc = self.nlp(self.subdict["text_english"])
@@ -105,5 +119,73 @@ class TextDetector(utils.AnalysisMethod):
         # where 0.0 is very objective and 1.0 is very subjective
         self.subdict["subjectivity"] = self.doc._.blob.subjectivity
 
-    def analyse_topic(self):
-        pass
+
+class PostprocessText:
+    def __init__(
+        self, mydict: dict = None, use_csv: bool = False, csv_path: str = None
+    ) -> None:
+        self.use_csv = use_csv
+        if mydict:
+            print("Reading data from dict.")
+            self.mydict = mydict
+            self.list_text_english = self.get_text_dict()
+        elif self.use_csv:
+            print("Reading data from df.")
+            self.df = pd.read_csv(csv_path, encoding="utf8")
+            self.list_text_english = self.get_text_df()
+        else:
+            raise ValueError(
+                "Please provide either dictionary with textual data or \
+                a csv file by setting `use_csv` to True and providing a \
+                `csv_path`."
+            )
+
+    def analyse_topic(self, return_topics: int = 3):
+        """Topic analysis using BERTopic."""
+        # load spacy pipeline
+        nlp = spacy.load(
+            "en_core_web_md",
+            exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
+        )
+        try:
+            # unfortunately catching exceptions does not work here - need to figure out why
+            self.topic_model = BERTopic(embedding_model=nlp)
+        except TypeError:
+            print("BERTopic excited with an error - maybe your dataset is too small?")
+        self.topics, self.probs = self.topic_model.fit_transform(self.list_text_english)
+        # return the topic list
+        topic_df = self.topic_model.get_topic_info()
+        # return the most frequent return_topics
+        most_frequent_topics = []
+        if len(topic_df) < return_topics:
+            print("You requested more topics than are identified in your dataset -")
+            print(
+                "Returning only {} topics as these are all that have been found.".format(
+                    len(topic_df)
+                )
+            )
+        for i in range(min(return_topics, len(topic_df))):
+            most_frequent_topics.append(self.topic_model.get_topic(i))
+        return self.topic_model, topic_df, most_frequent_topics
+
+    def get_text_dict(self):
+        # use dict to put text_english in list
+        list_text_english = []
+        for key in self.mydict.keys():
+            if "text_english" not in self.mydict[key]:
+                raise ValueError(
+                    "Please check your provided dictionary - \
+                    no english text data found."
+                )
+            list_text_english.append(self.mydict[key]["text_english"])
+        return list_text_english
+
+    def get_text_df(self):
+        # use csv file to obtain dataframe and put text_english in list
+        # check that "text_english" is there
+        if "text_english" not in self.df:
+            raise ValueError(
+                "Please check your provided dataframe - \
+                no english text data found."
+            )
+        return self.df["text_english"].tolist()
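The new PostprocessText.analyse_topic wraps a BERTopic-with-spaCy-embeddings pattern; a stripped-down sketch of that pattern follows, with an illustrative document list (in practice the list should be much larger, or BERTopic may fail to fit, as the notebook notes).

# Stripped-down version of the pattern used in PostprocessText.analyse_topic;
# the document list is illustrative only and far too small for a real fit.
import spacy
from bertopic import BERTopic

nlp = spacy.load(
    "en_core_web_md",
    exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
)
docs = ["first post text", "second post text", "third post text"]
topic_model = BERTopic(embedding_model=nlp)
topics, probs = topic_model.fit_transform(docs)
print(topic_model.get_topic_info())  # one row per topic, "-1" collects outliers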
157 notebooks/get-text-from-image.ipynb Generated
@@ -42,7 +42,18 @@
     "import os\n",
     "from IPython.display import Image, display\n",
     "import misinformation\n",
+    "import tensorflow as tf\n",
     "\n",
+    "print(tf.config.list_physical_devices(\"GPU\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27675810",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# download the models if they are not there yet\n",
     "!python -m spacy download en_core_web_md\n",
     "!python -m textblob.download_corpora"
@@ -55,9 +66,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "images = misinformation.find_files(\n",
-    " path=\"drive/MyDrive/misinformation-data/\", limit=1000\n",
-    ")"
+    "images = misinformation.find_files(path=\"../data/all/\", limit=1000)"
    ]
   },
   {
@@ -78,7 +87,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mydict = misinformation.utils.initialize_dict(images[0:10])"
+    "mydict = misinformation.utils.initialize_dict(images[0:3])"
    ]
   },
   {
@@ -99,7 +108,7 @@
    "source": [
     "os.environ[\n",
     " \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\""
+    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
    ]
   },
   {
@@ -180,13 +189,143 @@
    "outputs": [],
    "source": [
     "# Write the csv\n",
-    "df.to_csv(\"drive/MyDrive/misinformation-data/data_out.csv\")"
+    "df.to_csv(\"./data_out.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bc8ac0a",
+   "metadata": {},
+   "source": [
+    "# Topic analysis\n",
+    "The topic analysis is carried out using [BERTopic](https://maartengr.github.io/BERTopic/index.html) using an embedded model through a [spaCy](https://spacy.io/) pipeline."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4931941b",
+   "metadata": {},
+   "source": [
+    "BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the below returns an error for `analyse_topic()`, the reason can be that your dataset is too small.\n",
+    "### Option 1: Use the dictionary as obtained from the above analysis."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "568537df",
+   "id": "a3450a61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make a list of all the text_english entries per analysed image from the mydict variable as above\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    mydict=mydict\n",
+    ").analyse_topic()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95667342",
+   "metadata": {},
+   "source": [
+    "### Option 2: Read in a csv\n",
+    "Not to analyse too many images on google Cloud Vision, use the csv output to obtain the text (when rerunning already analysed images)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5530e436",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_file_path = \"data_out.csv\"\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    use_csv=True, csv_path=input_file_path\n",
+    ").analyse_topic(return_topics=10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b6ef6d7",
+   "metadata": {},
+   "source": [
+    "### Access frequent topics\n",
+    "A topic of `-1` stands for an outlier and should be ignored. Topic count is the number of occurence of that topic. The output is structured from most frequent to least frequent topic."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43288cda-61bb-4ff1-a209-dcfcc4916b1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(topic_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3316770",
+   "metadata": {},
+   "source": [
+    "### Get information for specific topic\n",
+    "The most frequent topics can be accessed through `most_frequent_topics` with the most occuring topics first in the list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db14fe03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for topic in most_frequent_topics:\n",
+    "    print(\"Topic:\", topic)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d10f701e",
+   "metadata": {},
+   "source": [
+    "### Topic visualization\n",
+    "The topics can also be visualized. Careful: This only works if there is sufficient data (quantity and quality)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2331afe6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.visualize_topics()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f4eaf353",
+   "metadata": {},
+   "source": [
+    "### Save the model\n",
+    "The model can be saved for future use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5e8377c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.save(\"misinfo_posts\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c94edb9",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -194,7 +333,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "misinf",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -208,7 +347,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
@@ -45,6 +45,9 @@ dependencies = [
     "jupyterlab",
     "spacytextblob",
     "textblob",
+    "bertopic",
+    "grpcio",
+    "pandas",
 ]
 
 [project.scripts]
@@ -23,3 +23,6 @@ jupyterlab
 spacytextblob
 textblob
 git+https://github.com/sloria/TextBlob.git@dev
+bertopic
+grpcio
+pandas