Mirror of
https://github.com/ssciwr/AMMICO.git
synced 2025-10-29 13:06:04 +02:00
Merge branch 'add_image_summary' of https://github.com/ssciwr/misinformation into add_image_summary
This commit is contained in:
Commit
a2c0bcd66c
35 README.md
@@ -11,12 +11,13 @@ Extract data from social media images and texts in disinformation campaigns

**_This project is currently under development!_**

Use the pre-processed social media posts (image files) and process them to collect information:
1. Cropping images to remove comments from posts
1. Text extraction from the images
1. Improving the preparation of the text for the data analysis (e.g., text cleaning)
1. Performing person and face recognition in images, facial expression recognition, as well as the extraction of any other available individual characteristics (e.g., gender, clothes)
1. Language recognition, translation into English, cleaning of the text/spell-check
1. Sentiment and subjectivity analysis
1. Performing person and face recognition in images, emotion recognition
1. Extraction of other non-human objects in the image
1. 5-color analysis of the images

This development will serve the fight against misinformation by providing more comprehensive data about its content and techniques.
The ultimate goal of this project is to develop a computer-assisted toolset to investigate the content of disinformation campaigns worldwide.
@@ -30,8 +31,30 @@ This will install the package and its dependencies locally.

# Usage

-There are sample notebooks in the `misinformation/notebooks` folder for you to explore the package usage:
+There are sample notebooks in the `misinformation/notebooks` folder for you to explore the package:
1. Text analysis: Use the notebook `get-text-from-image.ipynb` to extract any text from the images. The text is directly translated into English. If the text should be further analysed, set the keyword `analyse_text` to `True` as demonstrated in the notebook.\
   **You can run this notebook on google colab: [Here](https://colab.research.google.com/github/ssciwr/misinformation/blob/main/notebooks/get-text-from-image.ipynb)**
   Place the data files and google cloud vision API key in your google drive to access the data.
1. Facial analysis: Use the notebook `facial_expressions.ipynb` to identify whether there are faces in the image, whether they are wearing masks, and, if they are not wearing masks, also the race, gender, and dominant emotion.
   **You can run this notebook on google colab: [Here](https://colab.research.google.com/github/ssciwr/misinformation/blob/main/notebooks/facial_expressions.ipynb)**
   Place the data files in your google drive to access the data.
1. Object analysis: Use the notebook `ojects_expression.ipynb` to identify certain objects in the image. Currently, the following objects are being identified: person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, cell phone.

-There are further notebooks that are currently of exploratory nature (`colors_expression` to identify certain colors on the image, `get-text-from-image` to extract text that is contained in an image.)
+There are further notebooks that are currently of exploratory nature (`colors_expression.ipynb` to identify certain colors in the image).
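The notebooks above share a common flow. A minimal sketch, pieced together from the notebook cells shown further down in this diff (the data path and key filename are placeholders):

```python
import os

import misinformation

# point google-cloud-vision at your API key (placeholder filename)
os.environ[
    "GOOGLE_APPLICATION_CREDENTIALS"
] = "drive/MyDrive/misinformation-data/your-key.json"

# collect the image files and build the nested results dict
images = misinformation.find_files(path="drive/MyDrive/misinformation-data/", limit=1000)
mydict = misinformation.utils.initialize_dict(images[0:10])

# extract, translate, and (with analyse_text=True) further analyse the text per image
for key in mydict:
    mydict[key] = misinformation.text.TextDetector(
        mydict[key], analyse_text=True
    ).analyse_image()
```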
# Features

## Text extraction

The text is extracted from the images using [`google-cloud-vision`](https://cloud.google.com/vision). For this, you need an API key. Set up your google account following the instructions on the google Vision AI website.
You then need to export the location of the API key as an environment variable:
`export GOOGLE_APPLICATION_CREDENTIALS="location of your .json"`
The extracted text is then stored under the `text` key (column when exporting a csv).
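The call behind this feature appears in the `text.py` diff further down; as a standalone illustration, a minimal sketch of the same google-cloud-vision request (the image path is a placeholder):

```python
import io

from google.cloud import vision

# assumes GOOGLE_APPLICATION_CREDENTIALS is exported as described above
client = vision.ImageAnnotatorClient()

with io.open("example_post.png", "rb") as image_file:  # placeholder image
    content = image_file.read()
response = client.text_detection(image=vision.Image(content=content))

if response.error.message:
    raise ValueError(response.error.message)
# the first annotation holds the full detected text; it is stored under the "text" key
texts = response.text_annotations[0].description if response.text_annotations else None
print(texts)
```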
[Googletrans](https://py-googletrans.readthedocs.io/en/latest/) is used to recognize the language automatically and translate it into English. The text language and translated text are then stored under the `text_language` and `text_english` keys (column when exporting a csv).
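A minimal sketch of this step; the `.src` and `.text` attributes are the ones `translate_text` stores in the `text.py` diff below (the sample string is made up):

```python
from googletrans import Translator

translator = Translator()
translated = translator.translate("Dies ist ein Beispieltext.")  # made-up sample

print(translated.src)   # detected language, stored as "text_language"
print(translated.text)  # English translation, stored as "text_english"
```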
If you want to analyse the text further, you have to set the `analyse_text` keyword to `True`. The text is then processed using [spacy](https://spacy.io/) (tokenization, part-of-speech tagging, lemmatization, ...). The English text is cleaned of numbers and unrecognized words (`text_clean`), the spelling of the English text is corrected (`text_english_correct`), and sentiment and subjectivity analyses are carried out (`polarity`, `subjectivity`). The latter two steps are carried out using [TextBlob](https://textblob.readthedocs.io/en/dev/index.html). For more information on sentiment analysis with TextBlob, see [here](https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524).
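A sketch of that chain with spacy and spacytextblob, mirroring the pipeline set up in the `text.py` diff below. The `doc._.blob` attribute names follow current spacytextblob releases (an assumption; older versions exposed `doc._.polarity` directly), and the sample sentence is taken from the tests:

```python
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob  # noqa: F401, registers the "spacytextblob" pipe

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("spacytextblob")

doc = nlp("I love cats and dogs.")  # sample sentence from the tests
for token in doc[:3]:
    # tokenization, part-of-speech, lemma
    print(token.text, token.pos_, token.lemma_)
# polarity in [-1, 1], subjectivity in [0, 1]
print(doc._.blob.polarity, doc._.blob.subjectivity)
```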
## Emotion recognition

## Object detection

## Cropping of posts
@@ -28,7 +28,6 @@ def test_TextDetector():
    assert test_obj.subdict["text"] is None
    assert test_obj.subdict["text_language"] is None
    assert test_obj.subdict["text_english"] is None
-    assert test_obj.subdict["text_cleaned"] is None
    assert not test_obj.analyse_text
    assert not test_obj.analyse_topic
@@ -69,13 +68,13 @@ def test_translate_text():
    assert test_obj.subdict["text_english"] == translated_text


-def test_init_spacy():
+def test_run_spacy():
    test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
    ref_file = "./test/data/text_IMG_3755.txt"
    with open(ref_file, "r") as file:
        reference_text = file.read()
    test_obj.subdict["text_english"] = reference_text
-    test_obj._init_spacy()
+    test_obj._run_spacy()
    assert isinstance(test_obj.doc, spacy.tokens.doc.Doc)
@@ -102,7 +101,7 @@ def test_sentiment_analysis():
    mydict = {}
    test_obj = tt.TextDetector(mydict, analyse_text=True)
    test_obj.subdict["text_english"] = "I love cats and dogs."
-    test_obj._init_spacy()
+    test_obj._run_spacy()
    test_obj.correct_spelling()
    test_obj.sentiment_analysis()
    assert test_obj.subdict["polarity"] == 0.5
@@ -3,6 +3,7 @@ from googletrans import Translator
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from textblob import TextBlob
+from textblob import download_corpora
import io
from misinformation import utils
@@ -21,24 +22,32 @@ class TextDetector(utils.AnalysisMethod):
        self.analyse_text = analyse_text
        self.analyse_topic = analyse_topic
        if self.analyse_text:
-            # spacy load should be separate method with error if model not found / dynamic download
-            self.nlp = spacy.load("en_core_web_md")
-            self.nlp.add_pipe("spacytextblob")
+            self._initialize_spacy()
+            self._initialize_textblob()

    def set_keys(self) -> dict:
-        params = {
-            "text": None,
-            "text_language": None,
-            "text_english": None,
-            "text_cleaned": None,
-        }
+        params = {"text": None, "text_language": None, "text_english": None}
        return params

+    def _initialize_spacy(self):
+        try:
+            self.nlp = spacy.load("en_core_web_md")
+        except Exception:
+            spacy.cli.download("en_core_web_md")
+            self.nlp = spacy.load("en_core_web_md")
+        self.nlp.add_pipe("spacytextblob")
+
+    def _initialize_textblob(self):
+        try:
+            TextBlob("Here")
+        except Exception:
+            download_corpora.main()
+
    def analyse_image(self):
        self.get_text_from_image()
        self.translate_text()
        if self.analyse_text:
-            self._init_spacy()
+            self._run_spacy()
            self.clean_text()
            self.correct_spelling()
            self.sentiment_analysis()
@@ -56,7 +65,8 @@ class TextDetector(utils.AnalysisMethod):
        response = client.text_detection(image=image)
        texts = response.text_annotations[0].description
        # here check if text was found
-        self.subdict = {"text": texts}
+        if texts:
+            self.subdict["text"] = texts
        if response.error.message:
            raise ValueError(
                "{}\nFor more info on error messages, check: "
@@ -70,7 +80,7 @@ class TextDetector(utils.AnalysisMethod):
        self.subdict["text_language"] = translated.src
        self.subdict["text_english"] = translated.text

-    def _init_spacy(self):
+    def _run_spacy(self):
        """Generate spacy doc object."""
        self.doc = self.nlp(self.subdict["text_english"])
36 notebooks/facial_expressions.ipynb (generated)
@@ -16,6 +16,29 @@
   "This notebook shows some preliminary work on detecting facial expressions with DeepFace. It is mainly meant to explore its capabilities and to decide on future research directions. We package our code into a `misinformation` package that is imported here:"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "50c1c1c7",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# if running on google colab\n",
+   "# flake8-noqa-cell\n",
+   "import os\n",
+   "\n",
+   "if \"google.colab\" in str(get_ipython()):\n",
+   "    # update python version\n",
+   "    # install setuptools\n",
+   "    !pip install setuptools==61 -qqq\n",
+   "    # install misinformation\n",
+   "    !pip install git+https://github.com/ssciwr/misinformation.git -qqq\n",
+   "    # mount google drive for data and API key\n",
+   "    from google.colab import drive\n",
+   "\n",
+   "    drive.mount(\"/content/drive\")"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,
@@ -42,7 +65,7 @@
  "outputs": [],
  "source": [
   "images = misinformation.find_files(\n",
-   "    path=\"../data/test_no_text/\",\n",
+   "    path=\"drive/MyDrive/misinformation-data/\",\n",
   "    limit=1000,\n",
   ")"
  ]
@@ -173,7 +196,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
-   "df.to_csv(\"./data_out.csv\")"
+   "df.to_csv(\"drive/MyDrive/misinformation-data//data_out.csv\")"
  ]
 },
 {
@@ -187,7 +210,7 @@
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": "Python 3 (ipykernel)",
+  "display_name": "misinf",
   "language": "python",
   "name": "python3"
  },
@@ -201,7 +224,12 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
-  "version": "3.9.0"
+  "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]"
  },
+ "vscode": {
+  "interpreter": {
+   "hash": "da98320027a74839c7141b42ef24e2d47d628ba1f51115c13da5d8b45a372ec2"
+  }
+ }
 },
 "nbformat": 4,
69 notebooks/get-text-from-image.ipynb (generated)
@@ -9,6 +9,29 @@
   "Inga Ulusoy, SSC, July 2022"
  ]
 },
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f43f327c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "# if running on google colab\n",
+  "# flake8-noqa-cell\n",
+  "import os\n",
+  "\n",
+  "if \"google.colab\" in str(get_ipython()):\n",
+  "    # update python version\n",
+  "    # install setuptools\n",
+  "    !pip install setuptools==61 -qqq\n",
+  "    # install misinformation\n",
+  "    !pip install git+https://github.com/ssciwr/misinformation.git -qqq\n",
+  "    # mount google drive for data and API key\n",
+  "    from google.colab import drive\n",
+  "\n",
+  "    drive.mount(\"/content/drive\")"
+ ]
+},
 {
  "cell_type": "code",
  "execution_count": null,
@@ -18,7 +41,11 @@
  "source": [
   "import os\n",
   "from IPython.display import Image, display\n",
-  "import misinformation"
+  "import misinformation\n",
+  "\n",
+  "# download the models if they are not there yet\n",
+  "!python -m spacy download en_core_web_md\n",
+  "!python -m textblob.download_corpora"
  ]
 },
 {
@@ -28,7 +55,9 @@
  "metadata": {},
  "outputs": [],
  "source": [
-  "images = misinformation.find_files(path=\"../data/images-text/\", limit=1000)"
+  "images = misinformation.find_files(\n",
+  "    path=\"drive/MyDrive/misinformation-data/\", limit=1000\n",
+  ")"
  ]
 },
 {
@@ -38,7 +67,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
-  "for i in images[0:10]:\n",
+  "for i in images[0:3]:\n",
   "    display(Image(filename=i))"
  ]
 },
@@ -52,16 +81,6 @@
   "mydict = misinformation.utils.initialize_dict(images[0:10])"
  ]
 },
-{
- "cell_type": "code",
- "execution_count": null,
- "id": "3be954ef-d31f-4e4d-857c-c14d5fda91f1",
- "metadata": {},
- "outputs": [],
- "source": [
-  "mydict"
- ]
-},
 {
  "cell_type": "markdown",
  "id": "7b8b929f",
@@ -80,7 +99,7 @@
  "source": [
   "os.environ[\n",
   "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-  "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
+  "] = \"drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\""
  ]
 },
 {
@@ -118,17 +137,9 @@
  "source": [
   "for key in mydict:\n",
   "    print(key)\n",
-  "    mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
+  "    mydict[key] = misinformation.text.TextDetector(\n",
+  "        mydict[key], analyse_text=True\n",
+  "    ).analyse_image()"
  ]
 },
-{
- "cell_type": "code",
- "execution_count": null,
- "id": "c978fdb4-1f3a-4b78-b6ff-79c6e8a6fe82",
- "metadata": {},
- "outputs": [],
- "source": [
-  "print(mydict[\"109237S_spa\"][\"text_clean\"])"
- ]
-},
 {
@@ -169,7 +180,7 @@
  "outputs": [],
  "source": [
   "# Write the csv\n",
-  "df.to_csv(\"./data_out.csv\")"
+  "df.to_csv(\"drive/MyDrive/misinformation-data/data_out.csv\")"
  ]
 },
 {
@@ -183,7 +194,7 @@
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": "Python 3 (ipykernel)",
+  "display_name": "misinf",
   "language": "python",
   "name": "python3"
  },
@@ -197,11 +208,11 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
-  "version": "3.9.0"
+  "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]"
  },
  "vscode": {
   "interpreter": {
-   "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
+   "hash": "da98320027a74839c7141b42ef24e2d47d628ba1f51115c13da5d8b45a372ec2"
   }
  }
 },
@@ -14,7 +14,7 @@ maintainers = [
    { name = "Dominic Kempf", email = "ssc@iwr.uni-heidelberg.de" },
    { name = "Petr Andriushchenko", email = "ssc@iwr.uni-heidelberg.de" },
]
-requires-python = ">=3.9"
+requires-python = ">=3.8"
license = { text = "MIT" }
classifiers = [
    "Programming Language :: Python :: 3",
@@ -1,6 +1,6 @@
google-cloud-vision
cvlib
-deepface
+deepface<=0.0.75
ipywidgets
numpy<=1.23.4
opencv_python
@@ -21,4 +21,5 @@ spacy
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz
jupyterlab
spacytextblob
textblob
+git+https://github.com/sloria/TextBlob.git@dev