# Notebook for text extraction from images
Inga Ulusoy, SSC, July 2022

In [None]:
import os
from IPython.display import Image, display
import misinformation

In [None]:
# Collect the image files to work on via the project helper.
# NOTE(review): limit=1000 presumably caps the number of files returned —
# confirm against misinformation.find_files.
images = misinformation.find_files(limit=1000)

In [None]:
# Render every collected image inline in the notebook output.
for image_path in images:
    picture = Image(filename=image_path)
    display(picture)

In [None]:
# start with only English
# start with only English: keep the entries whose path string contains "eng"
mysubfiles = [image_path for image_path in images if "eng" in image_path]

In [None]:
# Render the selected subset of images inline in the notebook output.
for image_path in mysubfiles:
    picture = Image(filename=image_path)
    display(picture)

# Pre-process the images: Convert to greyscale and increase contrast

In [None]:
import cv2
from matplotlib import pyplot as plt
import numpy as np

In [None]:
! pip install matplotlib
! pip install numpy

In [None]:
def preprocess(filename):
    """Load an image and derive greyscale, sharpened and inverted versions.

    Args:
        filename: Path of the image file, passed to ``cv2.imread``.

    Returns:
        A tuple ``(gray, sharpened, inverted)``: the greyscale conversion of
        the image, the result of ``unsharp_mask`` applied to it, and the
        result of ``invert_image`` applied to the sharpened image.

    Raises:
        OSError: If ``cv2.imread`` could not load the file.
    """
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f"Could not read image file: {filename}")
    # Convert the BGR image loaded by OpenCV to greyscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # sharpen contrast by subtracting a smoothed version of the image
    # (unsharp mask); low-contrast pixels are left untouched via threshold
    sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)
    inverted = invert_image(sharpened)
    return gray, sharpened, inverted


# use unsharp mask algorithm from opencv
# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details
def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
    """Return a sharpened version of the image, using an unsharp mask.

    The image is blurred with ``cv2.GaussianBlur`` and the result is computed
    as ``(amount + 1) * image - amount * blurred``, clipped to ``[0, 255]``,
    rounded and converted to ``uint8``.

    Args:
        image: Input image array. The output is clipped to 0-255 and cast to
            ``uint8`` (so an 8-bit image is presumably expected — confirm).
        kernel_size: Gaussian kernel size passed to ``cv2.GaussianBlur``.
        sigma: Gaussian sigma passed to ``cv2.GaussianBlur``.
        amount: Sharpening strength. 0 leaves the image unchanged, higher
            values result in sharper images.
        threshold: Pixels whose absolute difference between original and
            blurred image is below this value keep their original value.
            Values <= 0 disable this step.

    Returns:
        The sharpened image as a ``uint8`` array.
    """
    blurred = cv2.GaussianBlur(image, kernel_size, sigma)
    # Work in floating point: subtracting unsigned 8-bit arrays directly
    # wraps around for negative differences instead of going below zero.
    image_f = image.astype(np.float64)
    blurred_f = blurred.astype(np.float64)
    sharpened = (float(amount) + 1.0) * image_f - float(amount) * blurred_f
    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)
    if threshold > 0:
        # Restore the original value wherever the local contrast is too low
        # to be worth sharpening.
        low_contrast_mask = np.absolute(image_f - blurred_f) < threshold
        np.copyto(sharpened, image, where=low_contrast_mask)
    return sharpened


def invert_image(image):
    """Return the bitwise complement of ``image`` (via ``cv2.bitwise_not``)."""
    inverted = cv2.bitwise_not(image)
    return inverted

In [None]:
# One (gray, sharpened, inverted) tuple from preprocess() per selected file.
grey_image = [preprocess(image_path) for image_path in mysubfiles]

In [None]:
for image in grey_image:
    # Each entry is the (gray, sharpened, inverted) tuple from preprocess().
    # Show the greyscale and the sharpened version as separate figures:
    # calling imshow twice before a single show() draws the second image on
    # top of the first, so only the sharpened one would be visible.
    for variant in image[:2]:
        # disable default colormap in imshow
        plt.imshow(variant, cmap="gray", vmin=0, vmax=255)
        plt.show()

Maybe preprocess further by cropping out the text regions?

# Try out different libraries
## The standard go-to tool that is slightly complicated: pytesseract
Install tesseract and the language libraries:
```
sudo apt install tesseract-ocr 
sudo apt install tesseract-ocr-all 
sudo apt install imagemagick 
``` 

In [None]:
from pytesseract import pytesseract

# Command that pytesseract uses to invoke tesseract.
# NOTE(review): the bare name "tesseract" presumably relies on the binary
# being on PATH — confirm for your setup.
pytesseract.tesseract_cmd = r"tesseract"

In [None]:
! pip install pytesseract

In [None]:
# Take the second preprocessed entry and unpack its three variants.
myimage = grey_image[1]
gray, sharpened, inverted = myimage
variants = (gray, sharpened, inverted)

# First show all three variants ...
for variant in variants:
    plt.imshow(variant, cmap="gray", vmin=0, vmax=255)
    plt.show()

# ... then run the OCR on each of them and print the extracted text.
for variant in variants:
    text = pytesseract.image_to_string(variant)
    print(text)

Here we would probably need to compare the extraction results from the differently preprocessed images and combine them into a final text.

In [None]:
# Structuring element for the noise reduction. It is identical for every
# image, so it is created once instead of on every loop iteration.
kernel = np.ones((2, 2), np.uint8)

for filename in mysubfiles:
    # Loading image using OpenCV
    img = cv2.imread(filename)

    # Preprocessing image
    # Converting to grayscale
    gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Creating binary image. With THRESH_OTSU the threshold is determined
    # automatically by Otsu's method and the thresh argument is not used,
    # so 0 is passed instead of a hand-picked value.
    binary_image = cv2.threshold(
        gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    # Inverting the image
    inverted_bin = cv2.bitwise_not(binary_image)

    # Some noise reduction: erode, then dilate with the same kernel
    processed_img = cv2.erode(inverted_bin, kernel, iterations=1)
    processed_img = cv2.dilate(processed_img, kernel, iterations=1)

    # Applying image_to_string method
    text = pytesseract.image_to_string(processed_img)
    plt.imshow(processed_img, cmap="gray", vmin=0, vmax=255)
    plt.show()
    print(text)

## keras-ocr
Not sure how to create an image object without a url.
https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html

In [None]:
import keras_ocr

# Create the keras-ocr pipeline with its default arguments; it is used
# further down via pipeline.recognize().
# NOTE(review): presumably loads pretrained models on construction — confirm.
pipeline = keras_ocr.pipeline.Pipeline()

In [None]:
! pip install keras-ocr

In [None]:
# Example images, read from their URLs with keras_ocr.tools.read.
sample_urls = [
    "https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg",
    "https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png",
]
images = [keras_ocr.tools.read(sample_url) for sample_url in sample_urls]

In [None]:
# Run the pipeline on the images read above; the next cell iterates
# prediction_groups[0] as (text, box) pairs.
prediction_groups = pipeline.recognize(images)

In [None]:
# Print only the text of each (text, box) prediction for the first image.
predicted_image_1 = prediction_groups[0]
for word, _box in predicted_image_1:
    print(word)

## google cloud vision API
First 1000 images per month are free.

In [None]:
# Point the Google client libraries at a service-account key file. Use
# setdefault so that credentials already configured in the environment are
# respected instead of being overwritten by this user-specific path.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json",
)
# Only pass a small slice of the English subset (entries 1 to 4).
images = mysubfiles[1:5]
misinformation.explore_analysis(images, identify="text-on-image")

## MS Azure
https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios