{ "cells": [ { "cell_type": "markdown", "id": "dcaa3da1", "metadata": {}, "source": [ "# Text extraction on image\n", "Inga Ulusoy, SSC, July 2022" ] }, { "cell_type": "code", "execution_count": 1, "id": "f43f327c", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:49:21.550942Z", "iopub.status.busy": "2023-05-16T11:49:21.550430Z", "iopub.status.idle": "2023-05-16T11:49:21.560102Z", "shell.execute_reply": "2023-05-16T11:49:21.559450Z" }, "tags": [] }, "outputs": [], "source": [ "# if running on google colab\n", "# flake8-noqa-cell\n", "import os\n", "\n", "if \"google.colab\" in str(get_ipython()):\n", " # update python version\n", " # install setuptools\n", " !pip install setuptools==61 -qqq\n", " # install ammico\n", " !pip install git+https://github.com/ssciwr/ammico.git -qqq\n", " # mount google drive for data and API key\n", " from google.colab import drive\n", "\n", " drive.mount(\"/content/drive\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "cf362e60", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:49:21.563451Z", "iopub.status.busy": "2023-05-16T11:49:21.563002Z", "iopub.status.idle": "2023-05-16T11:49:39.517727Z", "shell.execute_reply": "2023-05-16T11:49:39.517014Z" }, "tags": [] }, "outputs": [], "source": [ "import ammico\n", "from ammico import utils as mutils\n", "from ammico import display as mdisplay" ] }, { "cell_type": "code", "execution_count": 3, "id": "27675810", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:49:39.522426Z", "iopub.status.busy": "2023-05-16T11:49:39.521660Z", "iopub.status.idle": "2023-05-16T11:49:51.930718Z", "shell.execute_reply": "2023-05-16T11:49:51.929174Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting en-core-web-md==3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)\r\n", "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/42.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.1/42.8 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:22\u001b[0m\r", "\u001b[2K \u001b[91m━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/42.8 MB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:02\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/42.8 MB\u001b[0m \u001b[31m82.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.0/42.8 MB\u001b[0m \u001b[31m182.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.0/42.8 MB\u001b[0m \u001b[31m174.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.3/42.8 MB\u001b[0m \u001b[31m181.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━\u001b[0m \u001b[32m31.6/42.8 MB\u001b[0m \u001b[31m150.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━\u001b[0m \u001b[32m36.7/42.8 MB\u001b[0m \u001b[31m134.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m166.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m166.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m166.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m166.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m49.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n", "\u001b[?25h" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy<3.6.0,>=3.5.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from en-core-web-md==3.5.0) (3.5.3)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.12)\r\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.4)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.9)\r\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.7)\r\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.8)\r\n", "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.10)\r\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.1.1)\r\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.4.6)\r\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.8)\r\n", "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.0)\r\n", "Requirement already satisfied: pathy>=0.10.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.10.1)\r\n", "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (6.3.0)\r\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.65.0)\r\n", "Requirement already satisfied: numpy>=1.15.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.23.4)\r\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.30.0)\r\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.10.7)\r\n", "Requirement already satisfied: jinja2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.2)\r\n", "Requirement already satisfied: setuptools in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (58.1.0)\r\n", "Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (23.1)\r\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.3.0)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.5.0)\r\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.0)\r\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.10)\r\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.26.15)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2023.5.7)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.9)\r\n", "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.0.4)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.3)\r\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.1.2)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Installing collected packages: en-core-web-md\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Successfully installed en-core-web-md-3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m22.0.4\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\r\n", "You can now load the package via spacy.load('en_core_web_md')\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package brown to /home/runner/nltk_data...\r\n", "[nltk_data] Unzipping corpora/brown.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping tokenizers/punkt.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n", "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package conll2000 to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/conll2000.zip.\r\n", "[nltk_data] Downloading package movie_reviews to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/movie_reviews.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished.\r\n" ] } ], "source": [ "# download the models if they are not there yet\n", "!python -m spacy download en_core_web_md\n", "!python -m textblob.download_corpora" ] }, { "cell_type": "code", "execution_count": 4, "id": "6da3a7aa", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:49:51.935914Z", "iopub.status.busy": "2023-05-16T11:49:51.935219Z", "iopub.status.idle": "2023-05-16T11:49:51.941523Z", "shell.execute_reply": "2023-05-16T11:49:51.940278Z" }, "tags": [] }, "outputs": [], "source": [ "images = mutils.find_files(path=\"data\", limit=10)" ] }, { "cell_type": "code", "execution_count": 5, "id": "8b32409f", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:49:51.944599Z", "iopub.status.busy": "2023-05-16T11:49:51.944364Z", "iopub.status.idle": "2023-05-16T11:49:51.948756Z", "shell.execute_reply": "2023-05-16T11:49:51.948085Z" }, "tags": [] }, "outputs": [], "source": [ "mydict = mutils.initialize_dict(images)" ] }, { "cell_type": "markdown", "id": "7b8b929f", "metadata": {}, "source": [ "## google cloud vision API\n", "First 1000 images per month are free." ] }, { "cell_type": "markdown", "id": "0891b795-c7fe-454c-a45d-45fadf788142", "metadata": {}, "source": [ "## Inspect the elements per image" ] }, { "cell_type": "code", "execution_count": 6, "id": "7c6ecc88", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:49:51.951964Z", "iopub.status.busy": "2023-05-16T11:49:51.951516Z", "iopub.status.idle": "2023-05-16T11:49:53.151467Z", "shell.execute_reply": "2023-05-16T11:49:53.150719Z" }, "tags": [] }, "outputs": [ { "ename": "AttributeError", "evalue": "module 'ammico.display' has no attribute 'explore_analysis'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mmdisplay\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexplore_analysis\u001b[49m(mydict, identify\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext-on-image\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[0;31mAttributeError\u001b[0m: module 'ammico.display' has no attribute 'explore_analysis'" ] } ], "source": [ "mdisplay.explore_analysis(mydict, identify=\"text-on-image\")" ] }, { "cell_type": "markdown", "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52", "metadata": {}, "source": [ "## Or directly analyze for further processing" ] }, { "cell_type": "code", "execution_count": 7, "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:49:53.155205Z", "iopub.status.busy": "2023-05-16T11:49:53.154717Z", "iopub.status.idle": "2023-05-16T11:51:06.481210Z", "shell.execute_reply": "2023-05-16T11:51:06.463358Z" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4d504848c8ce48ca82cbaef171095615", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)/a4f8f3e/config.json: 0%| | 0.00/1.80k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1886f4a17817483ebac13ba22942fc9e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/1.22G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d1cbbad172ae466bb035e82155234e01", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/26.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a8568975ae2f4d7db4f111154cac9119", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)e/a4f8f3e/vocab.json: 0%| | 0.00/899k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4aff94e56ab14d4cad36f1cbd0e73e63", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)e/a4f8f3e/merges.txt: 0%| | 0.00/456k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "400 DEATHS GET E-BOOK X AN Corporation ncy Services A municipal worker sprays disinfectant on his colleague after they cremated the body of a man who died due to COVID-19 at a crematorium in Ahmedabad on April 12, 2020. | Photo Credit: REUTERS\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "323faa9445d848fbaed7d3f7a85e9ab2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)/af0f99b/config.json: 0%| | 0.00/629 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0c31e9b64b344de9ace4f6e45760f167", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/268M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8cba80aeb0cd4c909da2b33fa44f985e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/48.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bb65cc354f8b4f1db8b31a56becfdfad", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)ve/af0f99b/vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eca6a8a981a94a6faec57bb11e32607f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)/f2482bf/config.json: 0%| | 0.00/998 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6794c03c21554a1aab1702c5603cb56d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/1.33G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5006c872d9e045d89813fe8778492fe0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/60.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "915d45a8e5eb49ac82b4f5b06a1ea212", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)ve/f2482bf/vocab.txt: 0%| | 0.00/213k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BEYOND THIS POINT 2019-nCov Coronavirus\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW COUNTING METHOD RJ CITY HALL EXCLUDES 1,177 DEAD FROM COVID-19 STATISTICS G1 ARE TARGET OF PF OPERATION AGAINST FAKE NEWS OPERATION IS SEEN ON THE PLANALTO AT 10:09\n" ] } ], "source": [ "for key in mydict:\n", " mydict[key] = ammico.text.TextDetector(\n", " mydict[key], analyse_text=True\n", " ).analyse_image()" ] }, { "cell_type": "markdown", "id": "3c063eda", "metadata": {}, "source": [ "## Convert to dataframe and write csv" ] }, { "cell_type": "code", "execution_count": 8, "id": "5709c2cd", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:51:06.558843Z", "iopub.status.busy": "2023-05-16T11:51:06.558477Z", "iopub.status.idle": "2023-05-16T11:51:06.786646Z", "shell.execute_reply": "2023-05-16T11:51:06.785843Z" }, "tags": [] }, "outputs": [], "source": [ "outdict = mutils.append_data_to_dict(mydict)\n", "df = mutils.dump_df(outdict)" ] }, { "cell_type": "code", "execution_count": 9, "id": "c4f05637", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T11:51:06.791184Z", "iopub.status.busy": "2023-05-16T11:51:06.790679Z", "iopub.status.idle": "2023-05-16T11:51:07.074714Z", "shell.execute_reply": "2023-05-16T11:51:07.072930Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
| \n", " | filename | \n", "text | \n", "text_language | \n", "text_english | \n", "text_clean | \n", "text_english_correct | \n", "polarity | \n", "subjectivity | \n", "text_summary | \n", "sentiment | \n", "sentiment_score | \n", "entity | \n", "entity_type | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "data/102730_eng.png | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "en | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "DEATHS GET E - BOOK X AN Corporation Services ... | \n", "400 DEATHS GET E-BOOK X of Corporation ney Ser... | \n", "-0.125000 | \n", "0.375000 | \n", "A municipal worker sprays disinfectant on his... | \n", "NEGATIVE | \n", "0.991692 | \n", "[AN Corporation ncy Services, Ahmedabad, RE, #... | \n", "[ORG, LOC, PER, ORG] | \n", "
| 1 | \n", "data/102141_2_eng.png | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "en | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "0.000000 | \n", "0.000000 | \n", "Coronavirus QUARANTINE CORONAVIRUS OUTBREAK | \n", "NEGATIVE | \n", "0.976247 | \n", "[CORONAVIRUS, ##AR, ##TI, ##RONAVIR, ##C, Co] | \n", "[ORG, MISC, MISC, ORG, MISC, MISC] | \n", "
| 2 | \n", "data/106349S_por.png | \n", "NEWS URGENTE SAMSUNG AO VIVO Rio de Janeiro NO... | \n", "pt | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "-0.106818 | \n", "0.588636 | \n", "NEW COUNTING METHOD RJ City HALL EXCLUDES 1,1... | \n", "NEGATIVE | \n", "0.990659 | \n", "[Rio de Janeiro, C, ##IT, P, ##NA, ##LTO] | \n", "[LOC, ORG, LOC, LOC, ORG, LOC] | \n", "