* read image into nb

* test

* added keras-ocr and google vision

* google cloud vision by far the best

* setting up docker for text 1

* move widgets and analysis to display module

* move widgets and analysis to display module - 2

* text on image through widgets
This commit is contained in:
Inga Ulusoy 2022-07-27 22:41:13 +02:00, committed by GitHub
parent 7aa1247766
commit 446da693e3
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
10 changed files: 529 additions and 58 deletions

.gitignore

@@ -129,3 +129,6 @@ dmypy.json
 # Pyre type checker
 .pyre/
+
+# data folder
+data/

Dockerfile

@@ -2,7 +2,7 @@ FROM jupyter/base-notebook:2022-06-06
 # Install system dependencies for computer vision packages
 USER root
-RUN apt update && apt install -y libgl1 libglib2.0-0 libsm6 libxrender1 libxext6
+RUN apt update && apt install -y gcc libgl1 libglib2.0-0 libsm6 libxrender1 libxext6 tesseract-ocr
 USER $NB_USER
 # Copy the repository into the container

misinformation/__init__.py

@@ -5,5 +5,5 @@ from importlib import metadata
 __version__ = metadata.version(__package__)
 del metadata
-from misinformation.faces import explore_face_recognition
+from misinformation.display import explore_analysis
 from misinformation.utils import find_files

misinformation/display.py (new file)

@@ -0,0 +1,61 @@
import ipywidgets
from IPython.display import display
import misinformation.faces as faces
import misinformation.text as text


class JSONContainer:
    """Expose a Python dictionary as a JSON document in JupyterLab
    rich display rendering.
    """

    def __init__(self, data={}):
        self._data = data

    def _repr_json_(self):
        return self._data


def explore_analysis(image_paths, identify="faces"):
    # dictionary mapping the type of analysis to be explored
    identify_dict = {
        "faces": faces.facial_expression_analysis,
        "text-on-image": text.detect_text,
    }

    # Create an image selector widget
    image_select = ipywidgets.Select(
        options=image_paths, layout=ipywidgets.Layout(width="20%"), rows=20
    )

    # Set up the facial recognition output widget
    output = ipywidgets.Output(layout=ipywidgets.Layout(width="30%"))

    # Set up the image selection and display widget
    image_widget = ipywidgets.Box(
        children=[],
        layout=ipywidgets.Layout(width="50%"),
    )

    # Register the tab switch logic
    def switch(_):
        # Clear existing output
        image_widget.children = ()
        output.clear_output()

        # Create the new content
        image_widget.children = (ipywidgets.Image.from_file(image_select.value),)

        # This output widget absorbs print statements that are messing with
        # the widget output and cannot be disabled through the API.
        with faces.NocatchOutput():
            analysis = identify_dict[identify](image_select.value)
        with output:
            display(JSONContainer(analysis))

    # Register the handler and trigger it immediately
    image_select.observe(switch, names=("value",), type="change")
    switch(None)

    # Show the combined widget
    return ipywidgets.HBox([image_select, image_widget, output])
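
For orientation, a minimal sketch of how the new explore_analysis widget is meant to be called from a notebook, mirroring the notebooks in this commit (the limit value and the availability of local image files are assumptions):

import misinformation

images = misinformation.find_files(limit=20)
misinformation.explore_analysis(images, identify="faces")           # facial expression analysis
misinformation.explore_analysis(images, identify="text-on-image")   # OCR via Google Cloud Vision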

misinformation/faces.py

@@ -1,13 +1,12 @@
 import cv2
-import ipywidgets
 import numpy as np
 import os
 import pathlib
+import ipywidgets
 from tensorflow.keras.models import load_model
 from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
 from tensorflow.keras.preprocessing.image import img_to_array
-from IPython.display import display
 from deepface import DeepFace
 from retinaface import RetinaFace
@@ -148,18 +147,6 @@ def wears_mask(face):
     return bool(mask > withoutMask)
 
 
-class JSONContainer:
-    """Expose a Python dictionary as a JSON document in JupyterLab
-    rich display rendering.
-    """
-
-    def __init__(self, data={}):
-        self._data = data
-
-    def _repr_json_(self):
-        return self._data
-
-
 class NocatchOutput(ipywidgets.Output):
     """An output container that suppresses output, but not exceptions
@@ -168,42 +155,3 @@ class NocatchOutput(ipywidgets.Output):
     def __exit__(self, *args, **kwargs):
         super().__exit__(*args, **kwargs)
-
-
-def explore_face_recognition(image_paths):
-    # Create an image selector widget
-    image_select = ipywidgets.Select(
-        options=image_paths, layout=ipywidgets.Layout(width="20%"), rows=20
-    )
-
-    # Set up the facial recognition output widget
-    output = NocatchOutput(layout=ipywidgets.Layout(width="30%"))
-
-    # Set up the image selection and display widget
-    image_widget = ipywidgets.Box(
-        children=[],
-        layout=ipywidgets.Layout(width="50%"),
-    )
-
-    # Register the tab switch logic
-    def switch(_):
-        # Clear existing output
-        image_widget.children = ()
-        output.clear_output()
-
-        # Create the new content
-        image_widget.children = (ipywidgets.Image.from_file(image_select.value),)
-
-        # This output widget absorbes print statements that are messing with
-        # the widget output and cannot be disabled through the API.
-        with NocatchOutput():
-            analysis = facial_expression_analysis(image_select.value)
-        with output:
-            display(JSONContainer(analysis))
-
-    # Register the handler and trigger it immediately
-    image_select.observe(switch, names=("value",), type="change")
-    switch(None)
-
-    # Show the combined widget
-    return ipywidgets.HBox([image_select, image_widget, output])

misinformation/text.py (new file)

@@ -0,0 +1,26 @@
from google.cloud import vision
import io


def detect_text(path):
    """Detects text in the file."""
    client = vision.ImageAnnotatorClient()

    with io.open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations

    result = {"text": []}
    for text in texts:
        result["text"].append(text.description)

    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    return result
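
As a usage sketch of detect_text (the credentials path and the image file are placeholders; a Google Cloud service account with the Vision API enabled is assumed):

import os
from misinformation.text import detect_text

# point the client at a service account key file (path is illustrative)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/service-account-key.json"

result = detect_text("data/example-image.png")
print(result["text"])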

facial expression exploration notebook (.ipynb)

@@ -28,7 +28,7 @@
 },
 {
  "cell_type": "markdown",
- "id": "949d9f00-b129-477a-bc1d-e68fed73af2d",
+ "id": "a2bd2153",
  "metadata": {},
  "source": [
   "We select a subset of image files to try facial expression detection on. The `find_files` function finds image files within a given directory:"
@@ -77,8 +77,16 @@
  "metadata": {},
  "outputs": [],
  "source": [
-  "misinformation.explore_face_recognition(images)"
+  "misinformation.explore_analysis(images, identify=\"faces\")"
  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "b99f3f1d",
+  "metadata": {},
+  "outputs": [],
+  "source": []
 }
 ],
 "metadata": {
@@ -97,7 +105,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.10.5"
+ "version": "3.10.4"
 }
 },
 "nbformat": 4,

notebooks/get-text-from-image.ipynb (new file)

@@ -0,0 +1,414 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "dcaa3da1",
"metadata": {},
"source": [
"# Notebook for text extraction on image\n",
"Inga Ulusoy, SSC, July 2022"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf362e60",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from IPython.display import Image, display\n",
"import misinformation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6da3a7aa",
"metadata": {},
"outputs": [],
"source": [
"images = misinformation.find_files(limit=1000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf811ce0",
"metadata": {},
"outputs": [],
"source": [
"for i in images:\n",
" display(Image(filename=i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "366e2060",
"metadata": {},
"outputs": [],
"source": [
"# start with only English\n",
"mysubfiles = [i for i in images if \"eng\" in i]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b330b267",
"metadata": {},
"outputs": [],
"source": [
"for i in mysubfiles:\n",
" display(Image(filename=i))"
]
},
{
"cell_type": "markdown",
"id": "07b7a7a3",
"metadata": {},
"source": [
"# Pre-process the images: Convert to greyscale and increase contrast"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4cacfb0d",
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c94b02cb-9e96-4812-8448-8bc731bfd8aa",
"metadata": {},
"outputs": [],
"source": [
"! pip install matplotlib\n",
"! pip install numpy"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a2d3057",
"metadata": {},
"outputs": [],
"source": [
"def preprocess(filename):\n",
" \"\"\"Preprocess the image to enhance features for extraction.\"\"\"\n",
" image = cv2.imread(filename)\n",
" # preserve the original image\n",
" original = image.copy()\n",
" # Grayscale, Gaussian blur, Otsu's threshold\n",
" gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
" # sharpen contrast by first smoothing and then substracting the smoothed and thresholded version\n",
" sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)\n",
" inverted = invert_image(sharpened)\n",
" return gray, sharpened, inverted\n",
"\n",
"\n",
"# use unsharp mask algorithm from opencv\n",
"# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details\n",
"def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):\n",
" \"\"\"Return a sharpened version of the image, using an unsharp mask.\n",
" Amount: 1 is neutral, higher values result in shaprer images. threshold is the value below which the difference between blurred and original image gets discarded.\"\"\"\n",
" blurred = cv2.GaussianBlur(image, kernel_size, sigma)\n",
" sharpened = float(amount + 1) * image - float(amount) * blurred\n",
" sharpened = np.maximum(sharpened, np.zeros(sharpened.shape))\n",
" sharpened = np.minimum(sharpened, 255 * np.ones(sharpened.shape))\n",
" sharpened = sharpened.round().astype(np.uint8)\n",
" if threshold > 0:\n",
" low_contrast_mask = np.absolute(image - blurred) < threshold\n",
" np.copyto(sharpened, image, where=low_contrast_mask)\n",
" return sharpened\n",
"\n",
"\n",
"def invert_image(image):\n",
" return cv2.bitwise_not(image)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a483868b",
"metadata": {},
"outputs": [],
"source": [
"grey_image = []\n",
"for i in mysubfiles:\n",
" grey_image.append(preprocess(i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08ed750d",
"metadata": {},
"outputs": [],
"source": [
"for image in grey_image:\n",
" # disable default colormap in imshow\n",
" plt.imshow(image[0], cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.imshow(image[1], cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"id": "71ac2229",
"metadata": {},
"source": [
"mabe further preprocess in cropping out text regions..?"
]
},
{
"cell_type": "markdown",
"id": "7786d09c",
"metadata": {},
"source": [
"# Try out different libraries\n",
"## The standard go-to tool that is slightly complicated: pytesseract\n",
"Install tesseract and the language libraries:\n",
"```\n",
"sudo apt install tesseract-ocr \n",
"sudo apt install tesseract-ocr-all \n",
"sudo apt install imagemagick \n",
"``` "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d69504c",
"metadata": {},
"outputs": [],
"source": [
"from pytesseract import pytesseract\n",
"\n",
"pytesseract.tesseract_cmd = r\"tesseract\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b27c98c-b437-4c8b-8844-96d8718eea49",
"metadata": {},
"outputs": [],
"source": [
"! pip install pytesseract"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "529de8d8",
"metadata": {},
"outputs": [],
"source": [
"myimage = grey_image[1]\n",
"plt.imshow(myimage[0], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"plt.imshow(myimage[1], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"plt.imshow(myimage[2], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"text = pytesseract.image_to_string(myimage[0])\n",
"print(text)\n",
"text = pytesseract.image_to_string(myimage[1])\n",
"print(text)\n",
"text = pytesseract.image_to_string(myimage[2])\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"id": "e25dd39e",
"metadata": {},
"source": [
"Here we probably would need to compare extractopm from different preprocessed images and overlay in a final text."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36495f3f",
"metadata": {},
"outputs": [],
"source": [
"for image in mysubfiles:\n",
" # Loading image using OpenCV\n",
" img = cv2.imread(image)\n",
"\n",
" # Preprocessing image\n",
" # Converting to grayscale\n",
" gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" # creating Binary image by selecting proper threshold\n",
" binary_image = cv2.threshold(\n",
" gray_image, 130, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU\n",
" )[1]\n",
"\n",
" # Inverting the image\n",
" inverted_bin = cv2.bitwise_not(binary_image)\n",
"\n",
" # Some noise reduction\n",
" kernel = np.ones((2, 2), np.uint8)\n",
" processed_img = cv2.erode(inverted_bin, kernel, iterations=1)\n",
" processed_img = cv2.dilate(processed_img, kernel, iterations=1)\n",
"\n",
" # Applying image_to_string method\n",
" text = pytesseract.image_to_string(processed_img)\n",
" plt.imshow(processed_img, cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.show()\n",
" print(text)"
]
},
{
"cell_type": "markdown",
"id": "d6532019",
"metadata": {},
"source": [
"## keras-ocr\n",
"Not sure how to create an image object without a url.\n",
"https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e38871",
"metadata": {},
"outputs": [],
"source": [
"import keras_ocr\n",
"\n",
"pipeline = keras_ocr.pipeline.Pipeline()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2bb55068-ddd4-4b90-ae94-90181980d3c0",
"metadata": {},
"outputs": [],
"source": [
"! pip install keras-ocr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0002f2c4",
"metadata": {},
"outputs": [],
"source": [
"images = [\n",
" keras_ocr.tools.read(url)\n",
" for url in [\n",
" \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg\",\n",
" \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png\",\n",
" ]\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1567dc85",
"metadata": {},
"outputs": [],
"source": [
"prediction_groups = pipeline.recognize(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb1ca152",
"metadata": {},
"outputs": [],
"source": [
"predicted_image_1 = prediction_groups[0]\n",
"for text, box in predicted_image_1:\n",
" print(text)"
]
},
{
"cell_type": "markdown",
"id": "7b8b929f",
"metadata": {},
"source": [
"## google cloud vision API\n",
"First 1000 images per month are free."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c6ecc88",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\n",
" \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
"] = \"/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n",
"images = mysubfiles[1:5]\n",
"misinformation.explore_analysis(images, identify=\"text-on-image\")"
]
},
{
"cell_type": "markdown",
"id": "d54407ad",
"metadata": {},
"source": [
"## MS Azure\n",
"https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"vscode": {
"interpreter": {
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
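
Regarding the keras-ocr question raised in the notebook above (creating an image object without a URL): keras_ocr.tools.read should also accept a local file path, so a sketch along these lines may work. This is an untested assumption; mysubfiles stands in for the list of local image paths defined in the notebook.

import keras_ocr

pipeline = keras_ocr.pipeline.Pipeline()
# read local files instead of URLs (keras_ocr.tools.read accepts file paths as well)
local_images = [keras_ocr.tools.read(path) for path in mysubfiles]
prediction_groups = pipeline.recognize(local_images)
for text, box in prediction_groups[0]:
    print(text)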

pyproject.toml

@@ -25,6 +25,7 @@ dependencies = [
     "ipywidgets ==8.0.0rc1",
     "pooch",
     "retina-face",
+    "google-cloud-vision",
 ]
 
 [project.scripts]

requirements.txt (new file)

@@ -0,0 +1,10 @@
deepface
ipywidgets==8.0.0rc1
pooch
retina-face
opencv-python
matplotlib
numpy
keras-ocr
tensorflow
google-cloud-vision