{ "cells": [ { "cell_type": "markdown", "id": "dcaa3da1", "metadata": {}, "source": [ "# Text extraction on image\n", "Inga Ulusoy, SSC, July 2022" ] }, { "cell_type": "code", "execution_count": 1, "id": "f43f327c", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:55:09.293084Z", "iopub.status.busy": "2023-05-16T08:55:09.292436Z", "iopub.status.idle": "2023-05-16T08:55:09.305860Z", "shell.execute_reply": "2023-05-16T08:55:09.305058Z" }, "tags": [] }, "outputs": [], "source": [ "# if running on google colab\n", "# flake8-noqa-cell\n", "import os\n", "\n", "if \"google.colab\" in str(get_ipython()):\n", " # update python version\n", " # install setuptools\n", " !pip install setuptools==61 -qqq\n", " # install ammico\n", " !pip install git+https://github.com/ssciwr/ammico.git -qqq\n", " # mount google drive for data and API key\n", " from google.colab import drive\n", "\n", " drive.mount(\"/content/drive\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "cf362e60", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:55:09.310291Z", "iopub.status.busy": "2023-05-16T08:55:09.309569Z", "iopub.status.idle": "2023-05-16T08:55:26.492276Z", "shell.execute_reply": "2023-05-16T08:55:26.491489Z" }, "tags": [] }, "outputs": [], "source": [ "import ammico\n", "from ammico import utils as mutils\n", "from ammico import display as mdisplay" ] }, { "cell_type": "code", "execution_count": 3, "id": "27675810", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:55:26.497189Z", "iopub.status.busy": "2023-05-16T08:55:26.496344Z", "iopub.status.idle": "2023-05-16T08:55:40.011532Z", "shell.execute_reply": "2023-05-16T08:55:40.010231Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting en-core-web-md==3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Downloading 
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)\r\n", "\u001b[?25l" ] }, { "name": "stdout", "output_type": "stream", "text": [ " \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/42.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.1/42.8 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:19\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/42.8 MB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/42.8 MB\u001b[0m \u001b[31m91.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/42.8 MB\u001b[0m \u001b[31m171.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.9/42.8 MB\u001b[0m \u001b[31m176.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━\u001b[0m \u001b[32m27.9/42.8 MB\u001b[0m \u001b[31m174.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━\u001b[0m \u001b[32m34.0/42.8 MB\u001b[0m \u001b[31m173.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" 
] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━\u001b[0m \u001b[32m38.8/42.8 MB\u001b[0m \u001b[31m155.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m150.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m150.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m150.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m150.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m45.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n", "\u001b[?25h" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy<3.6.0,>=3.5.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from en-core-web-md==3.5.0) (3.5.3)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.12)\r\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in 
/opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.4)\r\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.9)\r\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.7)\r\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.8)\r\n", "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.10)\r\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.1.1)\r\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.4.6)\r\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.8)\r\n", "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.0)\r\n", "Requirement already satisfied: pathy>=0.10.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.10.1)\r\n", "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) 
(6.3.0)\r\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.65.0)\r\n", "Requirement already satisfied: numpy>=1.15.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.23.4)\r\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.30.0)\r\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.10.7)\r\n", "Requirement already satisfied: jinja2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.2)\r\n", "Requirement already satisfied: setuptools in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (58.1.0)\r\n", "Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (23.1)\r\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.3.0)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.5.0)\r\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) 
(3.1.0)\r\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.10)\r\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.26.15)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2023.5.7)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.9)\r\n", "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.0.4)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.3)\r\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.1.2)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Installing collected packages: en-core-web-md\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Successfully installed en-core-web-md-3.5.0\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: 
\u001b[0m\u001b[31;49m22.0.4\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\r\n", "You can now load the package via spacy.load('en_core_web_md')\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package brown to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/brown.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping tokenizers/punkt.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package conll2000 to /home/runner/nltk_data...\r\n", "[nltk_data] Unzipping corpora/conll2000.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package movie_reviews to\r\n", "[nltk_data] /home/runner/nltk_data...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Unzipping corpora/movie_reviews.zip.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished.\r\n" ] } ], "source": [ "# download the models if they are not there yet\n", "!python 
-m spacy download en_core_web_md\n", "!python -m textblob.download_corpora" ] }, { "cell_type": "code", "execution_count": 4, "id": "6da3a7aa", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:55:40.016445Z", "iopub.status.busy": "2023-05-16T08:55:40.015897Z", "iopub.status.idle": "2023-05-16T08:55:40.020769Z", "shell.execute_reply": "2023-05-16T08:55:40.020119Z" }, "tags": [] }, "outputs": [], "source": [ "images = mutils.find_files(path=\"data\", limit=10)" ] }, { "cell_type": "code", "execution_count": 5, "id": "8b32409f", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:55:40.024660Z", "iopub.status.busy": "2023-05-16T08:55:40.024103Z", "iopub.status.idle": "2023-05-16T08:55:40.029378Z", "shell.execute_reply": "2023-05-16T08:55:40.028614Z" }, "tags": [] }, "outputs": [], "source": [ "mydict = mutils.initialize_dict(images)" ] }, { "cell_type": "markdown", "id": "7b8b929f", "metadata": {}, "source": [ "## Google Cloud Vision API\n", "The first 1000 images per month are free." 
] }, { "cell_type": "markdown", "id": "0891b795-c7fe-454c-a45d-45fadf788142", "metadata": {}, "source": [ "## Inspect the elements per image" ] }, { "cell_type": "code", "execution_count": null, "id": "7c6ecc88", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:55:40.032984Z", "iopub.status.busy": "2023-05-16T08:55:40.032400Z", "iopub.status.idle": "2023-05-16T08:55:41.419926Z", "shell.execute_reply": "2023-05-16T08:55:41.419071Z" }, "tags": [] }, "outputs": [], "source": [ "# ammico.display has no `explore_analysis` (the previous run failed with AttributeError);\n", "# use the AnalysisExplorer app instead.\n", "# NOTE(review): confirm the constructor arguments against the installed ammico version.\n", "analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify=\"text-on-image\")\n", "analysis_explorer.run_server(port=8054)" ] }, { "cell_type": "markdown", "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52", "metadata": {}, "source": [ "## Or directly analyze for further processing" ] }, { "cell_type": "code", "execution_count": 7, "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:55:41.423867Z", "iopub.status.busy": "2023-05-16T08:55:41.423329Z", "iopub.status.idle": "2023-05-16T08:56:59.076383Z", "shell.execute_reply": "2023-05-16T08:56:59.068404Z" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9c6335e81a714abc82ebcc41f68a6ed3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading 
(…)/a4f8f3e/config.json: 0%| | 0.00/1.80k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c48876385e164469ae93bd899744fc1f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/1.22G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6a04a643e5ac4efbb8613aed3ae04a09", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/26.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "422b6c6aa97148489d5a707c422fcc83", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)e/a4f8f3e/vocab.json: 0%| | 0.00/899k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8760f156ee6c4b01a3fb8bb513dad93d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)e/a4f8f3e/merges.txt: 0%| | 0.00/456k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8b363d1bc7e84480a9c2de83d23018e1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)/af0f99b/config.json: 0%| | 0.00/629 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "06fe271bf0f34ddc839c2e97fb1df48a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/268M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5bb7ee5d8f174c1f89e66c608e18b09f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading 
(…)okenizer_config.json: 0%| | 0.00/48.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9c8df02c5cf9402ea8a42a86c80ca7bb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)ve/af0f99b/vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a300fba1b7294b88820174cc2836c917", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)/f2482bf/config.json: 0%| | 0.00/998 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1d5c2c6b5f7c4eee9816f5734eea00d7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading pytorch_model.bin: 0%| | 0.00/1.33G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7f3d4c2ae78a4aa5b810d114643a1312", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/60.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "940b6df8a53c4a9da58fe9dd105e3db1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)ve/f2482bf/vocab.txt: 0%| | 0.00/213k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for key in mydict:\n", " mydict[key] = ammico.text.TextDetector(\n", " mydict[key], analyse_text=True\n", " ).analyse_image()" ] }, { "cell_type": "markdown", "id": "3c063eda", "metadata": {}, "source": [ "## Convert to dataframe and write csv" ] }, { "cell_type": "code", "execution_count": 8, "id": "5709c2cd", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:56:59.109002Z", "iopub.status.busy": 
"2023-05-16T08:56:59.108315Z", "iopub.status.idle": "2023-05-16T08:56:59.206624Z", "shell.execute_reply": "2023-05-16T08:56:59.205776Z" }, "tags": [] }, "outputs": [], "source": [ "outdict = mutils.append_data_to_dict(mydict)\n", "df = mutils.dump_df(outdict)" ] }, { "cell_type": "code", "execution_count": 9, "id": "c4f05637", "metadata": { "execution": { "iopub.execute_input": "2023-05-16T08:56:59.210332Z", "iopub.status.busy": "2023-05-16T08:56:59.209802Z", "iopub.status.idle": "2023-05-16T08:56:59.326083Z", "shell.execute_reply": "2023-05-16T08:56:59.325214Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
| \n", " | filename | \n", "text | \n", "text_language | \n", "text_english | \n", "text_clean | \n", "text_english_correct | \n", "polarity | \n", "subjectivity | \n", "text_summary | \n", "sentiment | \n", "sentiment_score | \n", "entity | \n", "entity_type | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "data/102730_eng.png | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "en | \n", "400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | \n", "DEATHS GET E - BOOK X AN Corporation Services ... | \n", "400 DEATHS GET E-BOOK X of Corporation ney Ser... | \n", "-0.125000 | \n", "0.375000 | \n", "A municipal worker sprays disinfectant on his... | \n", "NEGATIVE | \n", "0.991692 | \n", "[AN Corporation ncy Services, Ahmedabad, RE, #... | \n", "[ORG, LOC, PER, ORG] | \n", "
| 1 | \n", "data/102141_2_eng.png | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "en | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | \n", "0.000000 | \n", "0.000000 | \n", "Coronavirus QUARANTINE CORONAVIRUS OUTBREAK | \n", "NEGATIVE | \n", "0.976247 | \n", "[CORONAVIRUS, ##AR, ##TI, ##RONAVIR, ##C, Co] | \n", "[ORG, MISC, MISC, ORG, MISC, MISC] | \n", "
| 2 | \n", "data/106349S_por.png | \n", "NEWS URGENTE SAMSUNG AO VIVO Rio de Janeiro NO... | \n", "pt | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | \n", "-0.106818 | \n", "0.588636 | \n", "NEW COUNTING METHOD RJ City HALL EXCLUDES 1,1... | \n", "NEGATIVE | \n", "0.990659 | \n", "[Rio de Janeiro, C, ##IT, P, ##NA, ##LTO] | \n", "[LOC, ORG, LOC, LOC, ORG, LOC] | \n", "