{ "cells": [ { "cell_type": "markdown", "id": "dcaa3da1", "metadata": {}, "source": [ "# Text extraction on image\n", "Inga Ulusoy, SSC, July 2022" ] }, { "cell_type": "code", "execution_count": 1, "id": "f43f327c", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:49:14.733285Z", "iopub.status.busy": "2023-04-19T11:49:14.732783Z", "iopub.status.idle": "2023-04-19T11:49:14.746068Z", "shell.execute_reply": "2023-04-19T11:49:14.745157Z" }, "tags": [] }, "outputs": [], "source": [ "# if running on google colab\n", "# flake8-noqa-cell\n", "import os\n", "\n", "if \"google.colab\" in str(get_ipython()):\n", " # update python version\n", " # install setuptools\n", " !pip install setuptools==61 -qqq\n", " # install misinformation\n", " !pip install git+https://github.com/ssciwr/misinformation.git -qqq\n", " # mount google drive for data and API key\n", " from google.colab import drive\n", "\n", " drive.mount(\"/content/drive\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "cf362e60", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:49:14.749889Z", "iopub.status.busy": "2023-04-19T11:49:14.749592Z", "iopub.status.idle": "2023-04-19T11:49:37.034945Z", "shell.execute_reply": "2023-04-19T11:49:37.034019Z" }, "tags": [] }, "outputs": [], "source": [ "import misinformation\n", "from misinformation import utils as mutils\n", "from misinformation import display as mdisplay" ] }, { "cell_type": "code", "execution_count": 3, "id": "27675810", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:49:37.040153Z", "iopub.status.busy": "2023-04-19T11:49:37.038794Z", "iopub.status.idle": "2023-04-19T11:49:52.898592Z", "shell.execute_reply": "2023-04-19T11:49:52.897342Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting en-core-web-md==3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Downloading 
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)\r\n", "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/42.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.1/42.8 MB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:26\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/42.8 MB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:03\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.2/42.8 MB\u001b[0m \u001b[31m40.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/42.8 MB\u001b[0m \u001b[31m53.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.9/42.8 MB\u001b[0m \u001b[31m90.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.3/42.8 MB\u001b[0m \u001b[31m93.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.6/42.8 MB\u001b[0m \u001b[31m92.7 MB/s\u001b[0m eta 
\u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.0/42.8 MB\u001b[0m \u001b[31m93.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.5/42.8 MB\u001b[0m \u001b[31m94.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━\u001b[0m \u001b[32m27.9/42.8 MB\u001b[0m \u001b[31m98.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━\u001b[0m \u001b[32m31.2/42.8 MB\u001b[0m \u001b[31m97.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━\u001b[0m \u001b[32m34.6/42.8 MB\u001b[0m \u001b[31m95.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━\u001b[0m \u001b[32m37.9/42.8 MB\u001b[0m \u001b[31m93.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m41.8/42.8 MB\u001b[0m \u001b[31m100.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m100.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m100.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m100.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m100.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m38.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n", "\u001b[?25h" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy<3.6.0,>=3.5.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from en-core-web-md==3.5.0) (3.5.2)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.9)\r\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.8)\r\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.7)\r\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.4.6)\r\n", "Requirement already satisfied: typer<0.8.0,>=0.3.0 in 
/opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.0)\r\n", "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.12)\r\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.3.0)\r\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.28.2)\r\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.10.7)\r\n", "Requirement already satisfied: pathy>=0.10.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.10.1)\r\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.65.0)\r\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.1.1)\r\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.8)\r\n", "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.9)\r\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from 
spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.4)\r\n", "Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (23.1)\r\n", "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (6.3.0)\r\n", "Requirement already satisfied: numpy>=1.15.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.23.4)\r\n", "Requirement already satisfied: jinja2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.2)\r\n", "Requirement already satisfied: setuptools in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (58.1.0)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.5.0)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.26.15)\r\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.0)\r\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.10)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in 
/opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2022.12.7)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.0.4)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.9)\r\n", "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.3)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: MarkupSafe>=2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.1.2)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Installing collected packages: en-core-web-md\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Successfully installed en-core-web-md-3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\r\n", "You can now load the package via 
spacy.load('en_core_web_md')\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package brown to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/brown.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping tokenizers/punkt.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package conll2000 to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/conll2000.zip.\r\n", "[nltk_data] Downloading package movie_reviews to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/movie_reviews.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished.\r\n" ] } ], "source": [ "# download the models if they are not there yet\n", "!python -m spacy download en_core_web_md\n", "!python -m textblob.download_corpora" ] }, { "cell_type": "code", "execution_count": 4, "id": "6da3a7aa", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:49:52.903564Z", "iopub.status.busy": "2023-04-19T11:49:52.903201Z", "iopub.status.idle": "2023-04-19T11:49:52.908824Z", "shell.execute_reply": "2023-04-19T11:49:52.907790Z" }, "tags": [] }, "outputs": [], "source": 
[ "images = mutils.find_files(path=\"data\", limit=10)" ] }, { "cell_type": "code", "execution_count": 5, "id": "8b32409f", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:49:52.913209Z", "iopub.status.busy": "2023-04-19T11:49:52.912933Z", "iopub.status.idle": "2023-04-19T11:49:52.917008Z", "shell.execute_reply": "2023-04-19T11:49:52.916039Z" }, "tags": [] }, "outputs": [], "source": [ "mydict = mutils.initialize_dict(images)" ] }, { "cell_type": "markdown", "id": "7b8b929f", "metadata": {}, "source": [ "## Google Cloud Vision API\n", "The first 1000 images per month are free." ] }, { "cell_type": "markdown", "id": "0891b795-c7fe-454c-a45d-45fadf788142", "metadata": {}, "source": [ "## Inspect the elements per image" ] }, { "cell_type": "code", "execution_count": 6, "id": "7c6ecc88", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:49:52.920972Z", "iopub.status.busy": "2023-04-19T11:49:52.920597Z", "iopub.status.idle": "2023-04-19T11:49:54.254613Z", "shell.execute_reply": "2023-04-19T11:49:54.253259Z" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "973d6d684eee4c1bab1f93e5c555481a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(Select(layout=Layout(width='20%'), options=('106349S_por', '102730_eng', '102141_2_eng'), rows=…" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mdisplay.explore_analysis(mydict, identify=\"text-on-image\")" ] }, { "cell_type": "markdown", "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52", "metadata": {}, "source": [ "## Or directly analyze for further processing" ] }, { "cell_type": "code", "execution_count": 7, "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:49:54.267696Z", "iopub.status.busy": "2023-04-19T11:49:54.267404Z", "iopub.status.idle": "2023-04-19T11:50:11.153123Z", "shell.execute_reply": 
"2023-04-19T11:50:11.151883Z" }, "tags": [] }, "outputs": [], "source": [ "for key in mydict:\n", " mydict[key] = misinformation.text.TextDetector(\n", " mydict[key], analyse_text=True\n", " ).analyse_image()" ] }, { "cell_type": "markdown", "id": "3c063eda", "metadata": {}, "source": [ "## Convert to dataframe and write CSV" ] }, { "cell_type": "code", "execution_count": 8, "id": "5709c2cd", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:50:11.157971Z", "iopub.status.busy": "2023-04-19T11:50:11.157612Z", "iopub.status.idle": "2023-04-19T11:50:11.163241Z", "shell.execute_reply": "2023-04-19T11:50:11.162440Z" }, "tags": [] }, "outputs": [], "source": [ "outdict = mutils.append_data_to_dict(mydict)\n", "df = mutils.dump_df(outdict)" ] }, { "cell_type": "code", "execution_count": 9, "id": "c4f05637", "metadata": { "execution": { "iopub.execute_input": "2023-04-19T11:50:11.166902Z", "iopub.status.busy": "2023-04-19T11:50:11.166638Z", "iopub.status.idle": "2023-04-19T11:50:11.182368Z", "shell.execute_reply": "2023-04-19T11:50:11.181369Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
| \n", " | filename | \n", "text | \n", "text_language | \n", "text_english | \n", "text_clean | \n", "text_english_correct | \n", "polarity | \n", "subjectivity | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "data/106349S_por.png | \n", "NEWS URGENTE SAMSUNG AO VIVO Rio de Janeiro NO... | \n", "pt | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "-0.106818 | \n", "0.588636 | \n", "
| 1 | \n", "data/102730_eng.png | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "en | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "DEATHS GET E - BOOK X AN Corporation Services ... | \n", "400 DEATHS GET E-BOOK X of Corporation ney Ser... | \n", "-0.125000 | \n", "0.375000 | \n", "
| 2 | \n", "data/102141_2_eng.png | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "en | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "0.000000 | \n", "0.000000 | \n", "