{ "cells": [ { "cell_type": "markdown", "id": "dcaa3da1", "metadata": {}, "source": [ "# Text extraction on image\n", "Inga Ulusoy, SSC, July 2022" ] }, { "cell_type": "code", "execution_count": 1, "id": "f43f327c", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:01:04.542421Z", "iopub.status.busy": "2023-05-05T10:01:04.541965Z", "iopub.status.idle": "2023-05-05T10:01:04.553323Z", "shell.execute_reply": "2023-05-05T10:01:04.552546Z" }, "tags": [] }, "outputs": [], "source": [ "# if running on google colab\n", "# flake8-noqa-cell\n", "import os\n", "\n", "if \"google.colab\" in str(get_ipython()):\n", "    # use the %pip magic (rather than !pip) so that the packages are installed\n", "    # into the environment of the running kernel\n", "    # install setuptools\n", "    %pip install setuptools==61 -qqq\n", "    # install ammico\n", "    %pip install git+https://github.com/ssciwr/ammico.git -qqq\n", "    # mount google drive for data and API key\n", "    from google.colab import drive\n", "\n", "    drive.mount(\"/content/drive\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "cf362e60", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:01:04.558904Z", "iopub.status.busy": "2023-05-05T10:01:04.558220Z", "iopub.status.idle": "2023-05-05T10:01:26.569199Z", "shell.execute_reply": "2023-05-05T10:01:26.568373Z" }, "tags": [] }, "outputs": [], "source": [ "import ammico\n", "from ammico import utils as mutils\n", "from ammico import display as mdisplay" ] }, { "cell_type": "code", "execution_count": 3, "id": "27675810", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:01:26.574359Z", "iopub.status.busy": "2023-05-05T10:01:26.573435Z", "iopub.status.idle": "2023-05-05T10:01:41.314936Z", "shell.execute_reply": "2023-05-05T10:01:41.313575Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting en-core-web-md==3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Downloading 
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)\r\n", "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/42.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.1/42.8 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:19\u001b[0m\r", "\u001b[2K \u001b[91m━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/42.8 MB\u001b[0m \u001b[31m25.9 MB/s\u001b[0m eta \u001b[36m0:00:02\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/42.8 MB\u001b[0m \u001b[31m62.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.4/42.8 MB\u001b[0m \u001b[31m131.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.3/42.8 MB\u001b[0m \u001b[31m134.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.9/42.8 MB\u001b[0m \u001b[31m147.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━\u001b[0m \u001b[32m27.6/42.8 MB\u001b[0m \u001b[31m163.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" 
] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━\u001b[0m \u001b[32m33.5/42.8 MB\u001b[0m \u001b[31m164.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━\u001b[0m \u001b[32m39.2/42.8 MB\u001b[0m \u001b[31m163.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m156.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m156.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m156.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m156.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m44.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n", "\u001b[?25h" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy<3.6.0,>=3.5.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from en-core-web-md==3.5.0) (3.5.2)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 
in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.12)\r\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.4)\r\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.9)\r\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.7)\r\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.8)\r\n", "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.10)\r\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.1.1)\r\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.4.6)\r\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.8)\r\n", "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.0)\r\n", "Requirement already satisfied: pathy>=0.10.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) 
(0.10.1)\r\n", "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (6.3.0)\r\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.65.0)\r\n", "Requirement already satisfied: numpy>=1.15.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.23.4)\r\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.30.0)\r\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.10.7)\r\n", "Requirement already satisfied: jinja2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.2)\r\n", "Requirement already satisfied: setuptools in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (58.1.0)\r\n", "Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (23.1)\r\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.3.0)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.5.0)\r\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.0)\r\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.10)\r\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.2)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2022.12.7)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.9)\r\n", "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.0.4)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.3)\r\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.1.2)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Installing collected packages: en-core-web-md\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"Successfully installed en-core-web-md-3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m22.0.4\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\r\n", "You can now load the package via spacy.load('en_core_web_md')\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package brown to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/brown.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping tokenizers/punkt.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package conll2000 to /home/runner/nltk_data...\r\n", "[nltk_data] Unzipping corpora/conll2000.zip.\r\n", "[nltk_data] Downloading package movie_reviews to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping 
corpora/movie_reviews.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished.\r\n" ] } ], "source": [ "import importlib.util\n", "\n", "# download the spaCy model only if it is not installed yet\n", "if importlib.util.find_spec(\"en_core_web_md\") is None:\n", "    !python -m spacy download en_core_web_md\n", "# fetch the textblob corpora\n", "!python -m textblob.download_corpora" ] }, { "cell_type": "code", "execution_count": 4, "id": "6da3a7aa", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:01:41.320366Z", "iopub.status.busy": "2023-05-05T10:01:41.319795Z", "iopub.status.idle": "2023-05-05T10:01:41.326125Z", "shell.execute_reply": "2023-05-05T10:01:41.324964Z" }, "tags": [] }, "outputs": [], "source": [ "# look up the input files under data/; limit=10 presumably caps how many are returned\n", "images = mutils.find_files(path=\"data\", limit=10)" ] }, { "cell_type": "code", "execution_count": 5, "id": "8b32409f", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:01:41.329644Z", "iopub.status.busy": "2023-05-05T10:01:41.329115Z", "iopub.status.idle": "2023-05-05T10:01:41.333110Z", "shell.execute_reply": "2023-05-05T10:01:41.332309Z" }, "tags": [] }, "outputs": [], "source": [ "# mydict holds one entry per key; the entries are handed to the detector further down\n", "mydict = mutils.initialize_dict(images)" ] }, { "cell_type": "markdown", "id": "7b8b929f", "metadata": {}, "source": [ "## google cloud vision API\n", "First 1000 images per month are free." 
] }, { "cell_type": "markdown", "id": "0891b795-c7fe-454c-a45d-45fadf788142", "metadata": {}, "source": [ "## Inspect the elements per image" ] }, { "cell_type": "code", "execution_count": 6, "id": "7c6ecc88", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:01:41.338083Z", "iopub.status.busy": "2023-05-05T10:01:41.337532Z", "iopub.status.idle": "2023-05-05T10:01:42.735422Z", "shell.execute_reply": "2023-05-05T10:01:42.734302Z" }, "tags": [] }, "outputs": [], "source": [ "# interactively inspect the text detected on each image\n", "# NOTE(review): ammico.display has no explore_analysis function; the explorer\n", "# is assumed to be the AnalysisExplorer class - confirm against the installed version\n", "analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify=\"text-on-image\")\n", "analysis_explorer.run_server(port=8054)" ] }, { "cell_type": "markdown", "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52", "metadata": {}, "source": [ "## Or directly analyze for further processing" ] }, { "cell_type": "code", "execution_count": 7, "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:01:42.739830Z", "iopub.status.busy": "2023-05-05T10:01:42.739347Z", "iopub.status.idle": "2023-05-05T10:03:50.848629Z", "shell.execute_reply": "2023-05-05T10:03:50.818575Z" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "db7fa2ba68f84bb588323015a0b87c4f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading 
(…)/a4f8f3e/config.json: 0%| | 0.00/1.80k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "83a0fe245c7f40c1a416efb7f9fb4cf5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/1.22G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "42fc763567b749238871cfcbf5dadbb8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/26.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5f2acce190ca40fcbcf45834550891e9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)e/a4f8f3e/vocab.json: 0%| | 0.00/899k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0a6a5ce369c84f7e90b65d2cbefb14e7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)e/a4f8f3e/merges.txt: 0%| | 0.00/456k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6c8fc519b43a4dabba8fae19fd1405d3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)/af0f99b/config.json: 0%| | 0.00/629 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "52cf7df7904244c2911e03e823dd8eb4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/268M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "aed01c97a4964a08ad718fd73d12ba7f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading 
(…)okenizer_config.json: 0%| | 0.00/48.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "058dfd2c58ce4b81ab1d92e4f8452ccb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)ve/af0f99b/vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d388865e2a854bffbcf2977af9b65fd5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)/f2482bf/config.json: 0%| | 0.00/998 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "903a2ee7edcc47178af75db85eedd2b5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/1.33G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c7c30398eced468ebd0670ebf0818ef0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/60.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a26987ed1a434f2290aed8a3a4689a26", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)ve/f2482bf/vocab.txt: 0%| | 0.00/213k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# run the text detector on every entry and store the result under the same key\n", "for key, entry in mydict.items():\n", "    detector = ammico.text.TextDetector(entry, analyse_text=True)\n", "    mydict[key] = detector.analyse_image()" ] }, { "cell_type": "markdown", "id": "3c063eda", "metadata": {}, "source": [ "## Convert to dataframe and write csv" ] }, { "cell_type": "code", "execution_count": 8, "id": "5709c2cd", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:03:50.941867Z", "iopub.status.busy": 
"2023-05-05T10:03:50.940645Z", "iopub.status.idle": "2023-05-05T10:03:51.207695Z", "shell.execute_reply": "2023-05-05T10:03:51.206592Z" }, "tags": [] }, "outputs": [], "source": [ "# combine the per-key results stored in mydict into a single dict\n", "# NOTE(review): the exact layout of outdict is defined by ammico.utils - confirm there\n", "outdict = mutils.append_data_to_dict(mydict)\n", "# turn the combined dict into the dataframe df\n", "df = mutils.dump_df(outdict)" ] }, { "cell_type": "code", "execution_count": 9, "id": "c4f05637", "metadata": { "execution": { "iopub.execute_input": "2023-05-05T10:03:51.213169Z", "iopub.status.busy": "2023-05-05T10:03:51.212560Z", "iopub.status.idle": "2023-05-05T10:03:51.520499Z", "shell.execute_reply": "2023-05-05T10:03:51.519220Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
| \n", " | filename | \n", "text | \n", "text_language | \n", "text_english | \n", "text_clean | \n", "text_english_correct | \n", "polarity | \n", "subjectivity | \n", "text_summary | \n", "sentiment | \n", "sentiment_score | \n", "entity | \n", "entity_type | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "data/102141_2_eng.png | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "en | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "0.000000 | \n", "0.000000 | \n", "Coronavirus QUARANTINE CORONAVIRUS OUTBREAK | \n", "NEGATIVE | \n", "0.976247 | \n", "[CORONAVIRUS, ##AR, ##TI, ##RONAVIR, ##C, Co] | \n", "[ORG, MISC, MISC, ORG, MISC, MISC] | \n", "
| 1 | \n", "data/106349S_por.png | \n", "NEWS URGENTE SAMSUNG AO VIVO Rio de Janeiro NO... | \n", "pt | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "-0.106818 | \n", "0.588636 | \n", "NEW COUNTING METHOD RJ City HALL EXCLUDES 1,1... | \n", "NEGATIVE | \n", "0.990659 | \n", "[Rio de Janeiro, C, ##IT, P, ##NA, ##LTO] | \n", "[LOC, ORG, LOC, LOC, ORG, LOC] | \n", "
| 2 | \n", "data/102730_eng.png | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "en | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "DEATHS GET E - BOOK X AN Corporation Services ... | \n", "400 DEATHS GET E-BOOK X of Corporation ney Ser... | \n", "-0.125000 | \n", "0.375000 | \n", "A municipal worker sprays disinfectant on his... | \n", "NEGATIVE | \n", "0.991692 | \n", "[AN Corporation ncy Services, Ahmedabad, RE, #... | \n", "[ORG, LOC, PER, ORG] | \n", "