# Text extraction on image
Inga Ulusoy, SSC, July 2022

In [None]:
# Google Colab setup: the indented body below only runs when executed on Colab
# flake8-noqa-cell
import os

# get_ipython() is supplied by the IPython kernel; the check looks for
# "google.colab" in its string representation to detect a Colab runtime
if "google.colab" in str(get_ipython()):
    # install a pinned setuptools release (61) before installing the package
    !pip install setuptools==61 -qqq
    # install the misinformation package straight from its GitHub repository
    !pip install git+https://github.com/ssciwr/misinformation.git -qqq
    # mount Google Drive at /content/drive for the data and the API key
    from google.colab import drive

    drive.mount("/content/drive")

In [None]:
import os
from IPython.display import Image, display
import misinformation
from misinformation import utils as mutils
from misinformation import display as mdisplay
import tensorflow as tf

In [None]:
# download the spaCy model "en_core_web_md" and the TextBlob corpora
# (both commands are issued every time this cell is executed;
# presumably they are needed by the text analysis below - confirm in the package docs)
!python -m spacy download en_core_web_md
!python -m textblob.download_corpora

In [None]:
# collect the image files to analyse from the "data" directory;
# limit=10 presumably caps the number of files returned (check mutils.find_files)
images = mutils.find_files(path="data", limit=10)

In [None]:
# show each of the collected image files as notebook output
for image_file in images:
    display(Image(filename=image_file))

In [None]:
# build the dictionary that the analysis cells below iterate over and fill
# with results (presumably one entry per image - confirm in mutils.initialize_dict)
mydict = mutils.initialize_dict(images)

## Google Cloud Vision API
The first 1000 images per month are free.

## Inspect the elements per image

In [None]:
# explore the "text-on-image" analysis for the entries in mydict
# (presumably an interactive, per-image view - confirm in mdisplay.explore_analysis)
mdisplay.explore_analysis(mydict, identify="text-on-image")

## Or directly analyze for further processing

In [None]:
# run the text detector (with analyse_text enabled) on every entry and store
# the returned result back under the same key; the key is printed as progress
for image_key in mydict:
    print(image_key)
    detector = misinformation.text.TextDetector(mydict[image_key], analyse_text=True)
    mydict[image_key] = detector.analyse_image()

## Convert to dataframe and write csv

In [None]:
# turn the analysis results into a dataframe in two steps:
# append_data_to_dict presumably regroups the per-image results (confirm in mutils),
# dump_df then builds the dataframe df from that intermediate dictionary
outdict = mutils.append_data_to_dict(mydict)
df = mutils.dump_df(outdict)

In [None]:
# check the dataframe: show its first rows (at most 10) as the cell output
# (assumes df is a pandas DataFrame - confirm against mutils.dump_df)
df.head(10)

In [None]:
# write the dataframe to data_out.csv in the current working directory
df.to_csv("./data_out.csv")