Text cleanup and sentiment analysis (#49)

* update notebook

* comments

* add jupyterlab

* add text analysis capability

* add bool in tests

* add dependencies and spelling test

* add test sentiment

* update black pre-commit dependency for native nb support

* update black version, find better sentiment test

* test analyse_image
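
In short, text analysis is now opt-in via constructor flags. A minimal usage sketch (hypothetical image path; analyse_image calls Google Cloud Vision, so GOOGLE_APPLICATION_CREDENTIALS must point at a valid service account file):

import misinformation.text as tt

# Hypothetical entry; in the package these dicts come from
# misinformation.utils.initialize_dict on a list of image files.
mydict = {"filename": "../data/images-text/104157S_eng.png"}
mydict = tt.TextDetector(mydict, analyse_text=True).analyse_image()
print(mydict["polarity"], mydict["subjectivity"])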
This commit is contained in:
Inga Ulusoy 2023-01-11 12:58:02 +01:00, committed by GitHub
parent b585097f19
commit 54728e02bb
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
9 changed files: 159 additions and 131 deletions

.github/workflows/ci.yml

@@ -28,6 +28,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install -e .
         python -m spacy download en_core_web_md
+        python -m textblob.download_corpora
     - name: Run pytest
       run: |
         cd misinformation
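
For context, the new CI step fetches the NLTK corpora that TextBlob uses at runtime (e.g. the punkt tokenizer); without them, TextBlob calls raise MissingCorpusError. A quick smoke test (a sketch, to be run after the download step):

from textblob import TextBlob

# Tokenization requires the punkt corpus installed by
# `python -m textblob.download_corpora`.
print(TextBlob("Corpora are available.").words)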

.pre-commit-config.yaml

@@ -8,10 +8,10 @@ repos:
     rev: 22.12.0
     hooks:
       - id: black
-  - repo: https://github.com/dfm/black_nbconvert
-    rev: v0.4.0
+  - repo: https://github.com/psf/black
+    rev: 22.12.0
     hooks:
-      - id: black_nbconvert
+      - id: black-jupyter
   - repo: https://github.com/pycqa/flake8
     rev: 6.0.0
     hooks:

misinformation/test/test_text.py

@@ -29,6 +29,18 @@ def test_TextDetector():
     assert test_obj.subdict["text_language"] is None
     assert test_obj.subdict["text_english"] is None
     assert test_obj.subdict["text_cleaned"] is None
+    assert not test_obj.analyse_text
+    assert not test_obj.analyse_topic
+
+
+@pytest.mark.gcv
+def test_analyse_image():
+    for item in TESTDICT:
+        test_obj = tt.TextDetector(TESTDICT[item])
+        test_obj.analyse_image()
+        test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True)
+        test_obj.analyse_image()
+        test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True)
 
 
 @pytest.mark.gcv
@@ -58,7 +70,7 @@ def test_translate_text():
 def test_init_spacy():
-    test_obj = tt.TextDetector(TESTDICT["IMG_3755"])
+    test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
     ref_file = "./test/data/text_IMG_3755.txt"
     with open(ref_file, "r") as file:
         reference_text = file.read()
@@ -75,3 +87,23 @@
     test_obj.clean_text()
     result = "I like cats and"
     assert test_obj.subdict["text_clean"] == result
+
+
+def test_correct_spelling():
+    mydict = {}
+    test_obj = tt.TextDetector(mydict, analyse_text=True)
+    test_obj.subdict["text_english"] = "I lik cats ad dogs."
+    test_obj.correct_spelling()
+    result = "I like cats ad dogs."
+    assert test_obj.subdict["text_english_correct"] == result
+
+
+def test_sentiment_analysis():
+    mydict = {}
+    test_obj = tt.TextDetector(mydict, analyse_text=True)
+    test_obj.subdict["text_english"] = "I love cats and dogs."
+    test_obj._init_spacy()
+    test_obj.correct_spelling()
+    test_obj.sentiment_analysis()
+    assert test_obj.subdict["polarity"] == 0.5
+    assert test_obj.subdict["subjectivity"] == 0.6
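
The expected 0.5/0.6 values are not arbitrary: they come from TextBlob's pattern sentiment lexicon, which spacytextblob wraps. A standalone check (sketch, assuming textblob and its corpora are installed):

from textblob import TextBlob

# "love" is the only word scored by the pattern lexicon here
# (polarity 0.5, subjectivity 0.6), so it sets the sentence score.
print(TextBlob("I love cats and dogs.").sentiment)
# Sentiment(polarity=0.5, subjectivity=0.6)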

misinformation/text.py

@ -1,17 +1,29 @@
from google.cloud import vision
from googletrans import Translator
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from textblob import TextBlob
import io
from misinformation import utils
# make widgets work again
# clean text has weird spaces and separation of "do n't"
# increase coverage for text
class TextDetector(utils.AnalysisMethod):
def __init__(self, subdict: dict) -> None:
def __init__(
self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
) -> None:
super().__init__(subdict)
self.subdict.update(self.set_keys())
self.translator = Translator()
# spacy load should be separaate method with error if model not found / dynamic download
self.nlp = spacy.load("en_core_web_md")
self.analyse_text = analyse_text
self.analyse_topic = analyse_topic
if self.analyse_text:
# spacy load should be separate method with error if model not found / dynamic download
self.nlp = spacy.load("en_core_web_md")
self.nlp.add_pipe("spacytextblob")
def set_keys(self) -> dict:
params = {
@ -25,8 +37,13 @@ class TextDetector(utils.AnalysisMethod):
def analyse_image(self):
self.get_text_from_image()
self.translate_text()
self._init_spacy()
self.clean_text()
if self.analyse_text:
self._init_spacy()
self.clean_text()
self.correct_spelling()
self.sentiment_analysis()
if self.analyse_topic:
self.analyse_topic()
return self.subdict
def get_text_from_image(self):
@ -65,3 +82,18 @@ class TextDetector(utils.AnalysisMethod):
token.text
) if token.pos_ != "NUM" and token.has_vector else None
self.subdict["text_clean"] = " ".join(templist).rstrip().lstrip()
def correct_spelling(self):
self.textblob = TextBlob(self.subdict["text_english"])
self.subdict["text_english_correct"] = str(self.textblob.correct())
def sentiment_analysis(self):
# self.subdict["sentiment"] = self.doc._.blob.sentiment_assessments.assessments
# polarity is between [-1.0, 1.0]
self.subdict["polarity"] = self.doc._.blob.polarity
# subjectivity is a float within the range [0.0, 1.0]
# where 0.0 is very objective and 1.0 is very subjective
self.subdict["subjectivity"] = self.doc._.blob.subjectivity
def analyse_topic(self):
pass
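
The polarity and subjectivity values are surfaced by spacytextblob, which attaches a TextBlob to each spaCy doc under the ._.blob extension (the spacytextblob 4.x API that doc._.blob.polarity implies). A minimal standalone sketch of the pipeline this class assembles:

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob  # noqa: F401, registers the component

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("spacytextblob")

doc = nlp("I love cats and dogs.")
# polarity lies in [-1.0, 1.0]; subjectivity in [0.0, 1.0]
print(doc._.blob.polarity, doc._.blob.subjectivity)  # 0.5 0.6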

notebooks/facial_expressions.ipynb

@@ -201,7 +201,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.9.0"
   }
  },
  "nbformat": 4,

notebooks/get-text-from-image.ipynb

@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "images = misinformation.find_files(path=\"../data/images-little-text/\", limit=1000)"
+    "images = misinformation.find_files(path=\"../data/images-text/\", limit=1000)"
    ]
   },
   {
@@ -42,28 +42,6 @@
     "    display(Image(filename=i))"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "366e2060",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# start with only English\n",
-    "mysubfiles = [i for i in images if \"eng\" in i]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b330b267",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i in mysubfiles[0:10]:\n",
-    "    display(Image(filename=i))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -102,7 +80,7 @@
    "source": [
     "os.environ[\n",
     "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"../data/seismic-bonfire-329406-412821a70264.json\""
+    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
    ]
   },
   {
@@ -143,14 +121,6 @@
     "    mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c75a2fff-9d59-4634-8d28-e90a909caa23",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -158,120 +128,54 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(mydict[\"104157S_eng\"][\"text\"])"
+    "print(mydict[\"109237S_spa\"][\"text_clean\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3c063eda",
+   "metadata": {},
+   "source": [
+    "## Convert to dataframe and write csv"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc",
+   "id": "5709c2cd",
    "metadata": {},
    "outputs": [],
    "source": [
-    "test = mydict[\"104157S_eng\"][\"text\"][0]\n",
-    "print(test)"
+    "outdict = misinformation.utils.append_data_to_dict(mydict)\n",
+    "df = misinformation.utils.dump_df(outdict)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2fb865ab-6d16-4a4c-b004-9d39fcba6812",
+   "id": "c4f05637",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# flake8-noqa-cell\n",
-    "from googletrans import Translator"
+    "# check the dataframe\n",
+    "df.head(10)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "051ad473-8f75-40fc-ae90-7d8176cf816f",
+   "id": "bf6c9ddb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "translator = Translator()"
+    "# Write the csv\n",
+    "df.to_csv(\"./data_out.csv\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b571d900-8829-4095-904f-dfee3ce46041",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "result = translator.translate(test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(result.text)\n",
-    "print(result.src)\n",
-    "print(result.origin)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# now clean the gibberish\n",
-    "# do spell check\n",
-    "# flake8-noqa-cell\n",
-    "import contextualSpellCheck\n",
-    "import spacy\n",
-    "\n",
-    "nlp = spacy.load(\"en_core_web_md\")\n",
-    "contextualSpellCheck.add_to_pipe(nlp)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "doc = nlp(result.text)\n",
-    "print(doc._.outcome_spellCheck)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_text = []\n",
-    "for token in doc:\n",
-    "    if token.pos_ != \"SPACE\":\n",
-    "        if token.pos_ != \"NUM\":\n",
-    "            if token.has_vector:\n",
-    "                final_text.append(token.text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\" \".join(final_text))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b1abe05e-0c2b-4769-8fa5-fc67297288ea",
+   "id": "568537df",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -293,7 +197,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.9.0"
   },
   "vscode": {
    "interpreter": {

notebooks/objects_expression.ipynb

@@ -38,7 +38,7 @@
    "outputs": [],
    "source": [
     "images = misinformation.find_files(\n",
-    "    path=\"/home/inga/projects/misinformation-project/misinformation/data/test_no_text/\",\n",
+    "    path=\"../data/images-little-text/\",\n",
     "    limit=1000,\n",
     ")"
    ]
@@ -52,6 +52,15 @@
     "mydict = misinformation.utils.initialize_dict(images)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mydict"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -136,6 +145,50 @@
     "misinformation.explore_analysis(mydict, identify=\"objects\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def localize_objects(path):\n",
+    "    \"\"\"Localize objects in the local image.\n",
+    "\n",
+    "    Args:\n",
+    "        path: The path to the local file.\n",
+    "    \"\"\"\n",
+    "    from google.cloud import vision\n",
+    "\n",
+    "    client = vision.ImageAnnotatorClient()\n",
+    "\n",
+    "    with open(path, \"rb\") as image_file:\n",
+    "        content = image_file.read()\n",
+    "    image = vision.Image(content=content)\n",
+    "\n",
+    "    objects = client.object_localization(image=image).localized_object_annotations\n",
+    "\n",
+    "    print(\"Number of objects found: {}\".format(len(objects)))\n",
+    "    for object_ in objects:\n",
+    "        print(\"\\n{} (confidence: {})\".format(object_.name, object_.score))\n",
+    "        print(\"Normalized bounding polygon vertices: \")\n",
+    "        for vertex in object_.bounding_poly.normalized_vertices:\n",
+    "            print(\" - ({}, {})\".format(vertex.x, vertex.y))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\n",
+    "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
+    "] = \"../../misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n",
+    "localize_objects(\"/home/iulusoy/Desktop/102141_2_eng.png\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -160,7 +213,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.5"
+   "version": "3.9.0"
   },
   "vscode": {
    "interpreter": {

pyproject.toml

@@ -42,6 +42,9 @@ dependencies = [
     "opencv-contrib-python",
     "googletrans==3.1.0a0",
     "spacy",
+    "jupyterlab",
+    "spacytextblob",
+    "textblob",
 ]
 
 [project.scripts]

requirements.txt

@@ -18,4 +18,7 @@ matplotlib
 opencv-contrib-python
 googletrans==3.1.0a0
 spacy
-https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz
+jupyterlab
+spacytextblob
+textblob