Text cleanup and sentiment analysis (#49)

* update notebook

* comments

* add jupyterlab

* add text analysis capability

* add bool in tests

* add dependencies and spelling test

* add test sentiment

* update black pre-commit dependency for native nb support

* update black version, find better sentiment test

* test analyse_image
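
In short, text analysis is now opt-in via constructor flags. A minimal usage sketch (hypothetical image path; analyse_image calls Google Cloud Vision, so GOOGLE_APPLICATION_CREDENTIALS must point at a valid service account file):

import misinformation.text as tt

# Hypothetical entry; in the package these dicts come from
# misinformation.utils.initialize_dict on a list of image files.
mydict = {"filename": "../data/images-text/104157S_eng.png"}
mydict = tt.TextDetector(mydict, analyse_text=True).analyse_image()
print(mydict["polarity"], mydict["subjectivity"])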
This commit is contained in:
Inga Ulusoy 2023-01-11 12:58:02 +01:00, committed by GitHub
parent b585097f19
commit 54728e02bb
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
9 changed files: 159 additions and 131 deletions

.github/workflows/ci.yml

@@ -28,6 +28,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install -e .
         python -m spacy download en_core_web_md
+        python -m textblob.download_corpora
     - name: Run pytest
       run: |
         cd misinformation
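
For context, the new CI step fetches the NLTK corpora that TextBlob uses at runtime (e.g. the punkt tokenizer); without them, TextBlob calls raise MissingCorpusError. A quick smoke test (a sketch, to be run after the download step):

from textblob import TextBlob

# Tokenization requires the punkt corpus installed by
# `python -m textblob.download_corpora`.
print(TextBlob("Corpora are available.").words)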

.pre-commit-config.yaml

@@ -8,10 +8,10 @@ repos:
     rev: 22.12.0
     hooks:
       - id: black
-  - repo: https://github.com/dfm/black_nbconvert
-    rev: v0.4.0
+  - repo: https://github.com/psf/black
+    rev: 22.12.0
     hooks:
-      - id: black_nbconvert
+      - id: black-jupyter
   - repo: https://github.com/pycqa/flake8
     rev: 6.0.0
     hooks:

misinformation/test/test_text.py

@@ -29,6 +29,18 @@ def test_TextDetector():
     assert test_obj.subdict["text_language"] is None
     assert test_obj.subdict["text_english"] is None
     assert test_obj.subdict["text_cleaned"] is None
+    assert not test_obj.analyse_text
+    assert not test_obj.analyse_topic
+
+
+@pytest.mark.gcv
+def test_analyse_image():
+    for item in TESTDICT:
+        test_obj = tt.TextDetector(TESTDICT[item])
+        test_obj.analyse_image()
+        test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True)
+        test_obj.analyse_image()
+        test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True)
 
 
 @pytest.mark.gcv
@@ -58,7 +70,7 @@ def test_translate_text():
 def test_init_spacy():
-    test_obj = tt.TextDetector(TESTDICT["IMG_3755"])
+    test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
     ref_file = "./test/data/text_IMG_3755.txt"
     with open(ref_file, "r") as file:
         reference_text = file.read()
@@ -75,3 +87,23 @@
     test_obj.clean_text()
     result = "I like cats and"
     assert test_obj.subdict["text_clean"] == result
+
+
+def test_correct_spelling():
+    mydict = {}
+    test_obj = tt.TextDetector(mydict, analyse_text=True)
+    test_obj.subdict["text_english"] = "I lik cats ad dogs."
+    test_obj.correct_spelling()
+    result = "I like cats ad dogs."
+    assert test_obj.subdict["text_english_correct"] == result
+
+
+def test_sentiment_analysis():
+    mydict = {}
+    test_obj = tt.TextDetector(mydict, analyse_text=True)
+    test_obj.subdict["text_english"] = "I love cats and dogs."
+    test_obj._init_spacy()
+    test_obj.correct_spelling()
+    test_obj.sentiment_analysis()
+    assert test_obj.subdict["polarity"] == 0.5
+    assert test_obj.subdict["subjectivity"] == 0.6
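
The expected 0.5/0.6 values are not arbitrary: they come from TextBlob's pattern sentiment lexicon, which spacytextblob wraps. A standalone check (sketch, assuming textblob and its corpora are installed):

from textblob import TextBlob

# "love" is the only word scored by the pattern lexicon here
# (polarity 0.5, subjectivity 0.6), so it sets the sentence score.
print(TextBlob("I love cats and dogs.").sentiment)
# Sentiment(polarity=0.5, subjectivity=0.6)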

misinformation/text.py

@ -1,17 +1,29 @@
from google.cloud import vision
from googletrans import Translator
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from textblob import TextBlob
import io
from misinformation import utils
# make widgets work again
# clean text has weird spaces and separation of "do n't"
# increase coverage for text
class TextDetector(utils.AnalysisMethod):
def __init__(self, subdict: dict) -> None:
def __init__(
self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
) -> None:
super().__init__(subdict)
self.subdict.update(self.set_keys())
self.translator = Translator()
# spacy load should be separaate method with error if model not found / dynamic download
self.nlp = spacy.load("en_core_web_md")
self.analyse_text = analyse_text
self.analyse_topic = analyse_topic
if self.analyse_text:
# spacy load should be separate method with error if model not found / dynamic download
self.nlp = spacy.load("en_core_web_md")
self.nlp.add_pipe("spacytextblob")
def set_keys(self) -> dict:
params = {
@ -25,8 +37,13 @@ class TextDetector(utils.AnalysisMethod):
def analyse_image(self):
self.get_text_from_image()
self.translate_text()
self._init_spacy()
self.clean_text()
if self.analyse_text:
self._init_spacy()
self.clean_text()
self.correct_spelling()
self.sentiment_analysis()
if self.analyse_topic:
self.analyse_topic()
return self.subdict
def get_text_from_image(self):
@ -65,3 +82,18 @@ class TextDetector(utils.AnalysisMethod):
token.text
) if token.pos_ != "NUM" and token.has_vector else None
self.subdict["text_clean"] = " ".join(templist).rstrip().lstrip()
def correct_spelling(self):
self.textblob = TextBlob(self.subdict["text_english"])
self.subdict["text_english_correct"] = str(self.textblob.correct())
def sentiment_analysis(self):
# self.subdict["sentiment"] = self.doc._.blob.sentiment_assessments.assessments
# polarity is between [-1.0, 1.0]
self.subdict["polarity"] = self.doc._.blob.polarity
# subjectivity is a float within the range [0.0, 1.0]
# where 0.0 is very objective and 1.0 is very subjective
self.subdict["subjectivity"] = self.doc._.blob.subjectivity
def analyse_topic(self):
pass
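
The polarity and subjectivity values are surfaced by spacytextblob, which attaches a TextBlob to each spaCy doc under the ._.blob extension (the spacytextblob 4.x API that doc._.blob.polarity implies). A minimal standalone sketch of the pipeline this class assembles:

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob  # noqa: F401, registers the component

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("spacytextblob")

doc = nlp("I love cats and dogs.")
# polarity lies in [-1.0, 1.0]; subjectivity in [0.0, 1.0]
print(doc._.blob.polarity, doc._.blob.subjectivity)  # 0.5 0.6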

notebooks/facial_expressions.ipynb

@@ -201,7 +201,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.9.0"
   }
  },
  "nbformat": 4,

notebooks/get-text-from-image.ipynb

@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "images = misinformation.find_files(path=\"../data/images-little-text/\", limit=1000)"
+    "images = misinformation.find_files(path=\"../data/images-text/\", limit=1000)"
    ]
   },
   {
@@ -42,28 +42,6 @@
     "    display(Image(filename=i))"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "366e2060",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# start with only English\n",
-    "mysubfiles = [i for i in images if \"eng\" in i]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b330b267",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i in mysubfiles[0:10]:\n",
-    "    display(Image(filename=i))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -102,7 +80,7 @@
    "source": [
     "os.environ[\n",
     "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"../data/seismic-bonfire-329406-412821a70264.json\""
+    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
    ]
   },
   {
@@ -143,14 +121,6 @@
     "    mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c75a2fff-9d59-4634-8d28-e90a909caa23",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -158,120 +128,54 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(mydict[\"104157S_eng\"][\"text\"])"
+    "print(mydict[\"109237S_spa\"][\"text_clean\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3c063eda",
+   "metadata": {},
+   "source": [
+    "## Convert to dataframe and write csv"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc",
+   "id": "5709c2cd",
    "metadata": {},
    "outputs": [],
    "source": [
-    "test = mydict[\"104157S_eng\"][\"text\"][0]\n",
-    "print(test)"
+    "outdict = misinformation.utils.append_data_to_dict(mydict)\n",
+    "df = misinformation.utils.dump_df(outdict)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2fb865ab-6d16-4a4c-b004-9d39fcba6812",
+   "id": "c4f05637",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# flake8-noqa-cell\n",
-    "from googletrans import Translator"
+    "# check the dataframe\n",
+    "df.head(10)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "051ad473-8f75-40fc-ae90-7d8176cf816f",
+   "id": "bf6c9ddb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "translator = Translator()"
+    "# Write the csv\n",
+    "df.to_csv(\"./data_out.csv\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b571d900-8829-4095-904f-dfee3ce46041",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "result = translator.translate(test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(result.text)\n",
-    "print(result.src)\n",
-    "print(result.origin)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# now clean the gibberish\n",
-    "# do spell check\n",
-    "# flake8-noqa-cell\n",
-    "import contextualSpellCheck\n",
-    "import spacy\n",
-    "\n",
-    "nlp = spacy.load(\"en_core_web_md\")\n",
-    "contextualSpellCheck.add_to_pipe(nlp)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "doc = nlp(result.text)\n",
-    "print(doc._.outcome_spellCheck)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_text = []\n",
-    "for token in doc:\n",
-    "    if token.pos_ != \"SPACE\":\n",
-    "        if token.pos_ != \"NUM\":\n",
-    "            if token.has_vector:\n",
-    "                final_text.append(token.text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\" \".join(final_text))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b1abe05e-0c2b-4769-8fa5-fc67297288ea",
+   "id": "568537df",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -293,7 +197,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.9.0"
   },
   "vscode": {
    "interpreter": {

notebooks/objects_expression.ipynb

@@ -38,7 +38,7 @@
    "outputs": [],
    "source": [
     "images = misinformation.find_files(\n",
-    "    path=\"/home/inga/projects/misinformation-project/misinformation/data/test_no_text/\",\n",
+    "    path=\"../data/images-little-text/\",\n",
     "    limit=1000,\n",
     ")"
    ]
@@ -52,6 +52,15 @@
     "mydict = misinformation.utils.initialize_dict(images)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mydict"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -136,6 +145,50 @@
     "misinformation.explore_analysis(mydict, identify=\"objects\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def localize_objects(path):\n",
+    "    \"\"\"Localize objects in the local image.\n",
+    "\n",
+    "    Args:\n",
+    "        path: The path to the local file.\n",
+    "    \"\"\"\n",
+    "    from google.cloud import vision\n",
+    "\n",
+    "    client = vision.ImageAnnotatorClient()\n",
+    "\n",
+    "    with open(path, \"rb\") as image_file:\n",
+    "        content = image_file.read()\n",
+    "    image = vision.Image(content=content)\n",
+    "\n",
+    "    objects = client.object_localization(image=image).localized_object_annotations\n",
+    "\n",
+    "    print(\"Number of objects found: {}\".format(len(objects)))\n",
+    "    for object_ in objects:\n",
+    "        print(\"\\n{} (confidence: {})\".format(object_.name, object_.score))\n",
+    "        print(\"Normalized bounding polygon vertices: \")\n",
+    "        for vertex in object_.bounding_poly.normalized_vertices:\n",
+    "            print(\" - ({}, {})\".format(vertex.x, vertex.y))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\n",
+    "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
+    "] = \"../../misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n",
+    "localize_objects(\"/home/iulusoy/Desktop/102141_2_eng.png\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -160,7 +213,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.5"
+   "version": "3.9.0"
   },
   "vscode": {
    "interpreter": {

pyproject.toml

@@ -42,6 +42,9 @@ dependencies = [
     "opencv-contrib-python",
     "googletrans==3.1.0a0",
     "spacy",
+    "jupyterlab",
+    "spacytextblob",
+    "textblob",
 ]
 
 [project.scripts]

requirements.txt

@@ -18,4 +18,7 @@ matplotlib
 opencv-contrib-python
 googletrans==3.1.0a0
 spacy
-https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz
+jupyterlab
+spacytextblob
+textblob