Text extraction from images
Inga Ulusoy, SSC, July 2022
[1]:
# if running on google colab
# flake8-noqa-cell
import os
if "google.colab" in str(get_ipython()):
# update python version
# install setuptools
!pip install setuptools==61 -qqq
# install ammico
!pip install git+https://github.com/ssciwr/ammico.git -qqq
# mount google drive for data and API key
from google.colab import drive
drive.mount("/content/drive")
[2]:
import ammico
from ammico import utils as mutils
from ammico import display as mdisplay
[3]:
# Download the language resources: the spaCy "en_core_web_md" model and the
# TextBlob corpora (presumably required by the text analysis further down).
# Both commands are invoked unconditionally whenever this cell runs.
!python -m spacy download en_core_web_md
!python -m textblob.download_corpora
Collecting en-core-web-md==3.5.0
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 45.5 MB/s eta 0:00:00
Requirement already satisfied: spacy<3.6.0,>=3.5.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from en-core-web-md==3.5.0) (3.5.3)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.12)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.4)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.9)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.7)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.8)
Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.10)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.1.1)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.4.6)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.8)
Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.0)
Requirement already satisfied: pathy>=0.10.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.10.1)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (6.3.0)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.65.0)
Requirement already satisfied: numpy>=1.15.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.23.4)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.30.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.10.7)
Requirement already satisfied: jinja2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.2)
Requirement already satisfied: setuptools in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (58.1.0)
Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (23.1)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.3.0)
Requirement already satisfied: typing-extensions>=4.2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.5.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.0)
Requirement already satisfied: idna<4,>=2.5 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.26.15)
Requirement already satisfied: certifi>=2017.4.17 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2023.5.7)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.9)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.0.4)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.3)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.1.2)
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[notice] A new release of pip is available: 22.0.4 -> 23.1.2
[notice] To update, run: pip install --upgrade pip
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')
[nltk_data] Downloading package brown to /home/runner/nltk_data...
[nltk_data] Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /home/runner/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/runner/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /home/runner/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /home/runner/nltk_data...
[nltk_data] Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data] /home/runner/nltk_data...
[nltk_data] Unzipping corpora/movie_reviews.zip.
Finished.
[4]:
# Collect the image files to analyse from the "data" directory.
# NOTE(review): limit=10 presumably caps the number of files returned -
# confirm against ammico.utils.find_files.
images = mutils.find_files(path="data", limit=10)
[5]:
# Create `mydict` from the found images; the cells below iterate over its
# keys and overwrite each value with the analysis result for that entry.
mydict = mutils.initialize_dict(images)
Google Cloud Vision API
The first 1000 images per month are free.
Inspect the elements per image
[6]:
# `ammico.display` no longer provides `explore_analysis` (calling it raises
# AttributeError); the interactive explorer is exposed as the
# `AnalysisExplorer` class, which is started via `run_server`.
analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify="text-on-image")
analysis_explorer.run_server(port=8054)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[6], line 1
----> 1 mdisplay.explore_analysis(mydict, identify="text-on-image")
AttributeError: module 'ammico.display' has no attribute 'explore_analysis'
Or directly analyze for further processing
[7]:
# Run text detection (with text analysis enabled) on every entry and store
# the result back under the same key. Only values are replaced, so iterating
# over the dict while assigning is safe.
for key in mydict:
    mydict[key] = ammico.text.TextDetector(
        mydict[key], analyse_text=True
    ).analyse_image()
Convert to a dataframe and write a CSV file
[8]:
# Pass the per-image results through append_data_to_dict, then build the
# dataframe `df` from the outcome with dump_df.
# NOTE(review): `df` is presumably a pandas DataFrame (it is used with
# .head() and .to_csv() below) - confirm against ammico.utils.dump_df.
outdict = mutils.append_data_to_dict(mydict)
df = mutils.dump_df(outdict)
[9]:
# Check the dataframe: display at most its first 10 rows.
df.head(10)
[9]:
| | filename | text | text_language | text_english | text_clean | text_english_correct | polarity | subjectivity | text_summary | sentiment | sentiment_score | entity | entity_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | data/102730_eng.png | 400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | en | 400 DEATHS GET E-BOOK X AN Corporation ncy Ser... | DEATHS GET E - BOOK X AN Corporation Services ... | 400 DEATHS GET E-BOOK X of Corporation ney Ser... | -0.125000 | 0.375000 | A municipal worker sprays disinfectant on his... | NEGATIVE | 0.991692 | [AN Corporation ncy Services, Ahmedabad, RE, #... | [ORG, LOC, PER, ORG] |
| 1 | data/102141_2_eng.png | CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | en | CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE... | 0.000000 | 0.000000 | Coronavirus QUARANTINE CORONAVIRUS OUTBREAK | NEGATIVE | 0.976247 | [CORONAVIRUS, ##AR, ##TI, ##RONAVIR, ##C, Co] | [ORG, MISC, MISC, ORG, MISC, MISC] |
| 2 | data/106349S_por.png | NEWS URGENTE SAMSUNG AO VIVO Rio de Janeiro NO... | pt | NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO... | -0.106818 | 0.588636 | NEW COUNTING METHOD RJ City HALL EXCLUDES 1,1... | NEGATIVE | 0.990659 | [Rio de Janeiro, C, ##IT, P, ##NA, ##LTO] | [LOC, ORG, LOC, LOC, ORG, LOC] |
[10]:
# Write the dataframe to "data_out.csv" in the current working directory.
df.to_csv("./data_out.csv")
[ ]: