{
"cells": [
{
"cell_type": "markdown",
"id": "dcaa3da1",
"metadata": {},
"source": [
"# Notebook for text extraction on image\n",
"Inga Ulusoy, SSC, July 2022"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf362e60",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from IPython.display import Image, display\n",
"import misinformation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6da3a7aa",
"metadata": {},
"outputs": [],
"source": [
"images = misinformation.find_files(limit=1000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf811ce0",
"metadata": {},
"outputs": [],
"source": [
"for i in images:\n",
" display(Image(filename=i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "366e2060",
"metadata": {},
"outputs": [],
"source": [
"# start with only English\n",
"mysubfiles = [i for i in images if \"eng\" in i]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b330b267",
"metadata": {},
"outputs": [],
"source": [
"for i in mysubfiles:\n",
" display(Image(filename=i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b32409f",
"metadata": {},
"outputs": [],
"source": [
"mydict = misinformation.utils.initialize_dict(mysubfiles)"
]
},
{
"cell_type": "markdown",
"id": "07b7a7a3",
"metadata": {},
"source": [
"# Pre-process the images: Convert to greyscale and increase contrast"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4cacfb0d",
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c94b02cb-9e96-4812-8448-8bc731bfd8aa",
"metadata": {},
"outputs": [],
"source": [
"! pip install matplotlib\n",
"! pip install numpy"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a2d3057",
"metadata": {},
"outputs": [],
"source": [
"def preprocess(filename):\n",
"    \"\"\"Convert an image file into greyscale, sharpened and inverted versions.\n",
"\n",
"    Returns a tuple (gray, sharpened, inverted) of uint8 arrays, giving the\n",
"    OCR step several variants of the same image to choose from.\"\"\"\n",
"    image = cv2.imread(filename)\n",
"    # colour carries no information for text extraction - work in greyscale\n",
"    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
"    # sharpen contrast by smoothing and then subtracting the smoothed version\n",
"    sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)\n",
"    inverted = invert_image(sharpened)\n",
"    return gray, sharpened, inverted\n",
"\n",
"\n",
"# unsharp mask algorithm as used with opencv\n",
"# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details\n",
"def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):\n",
"    \"\"\"Return a sharpened version of the image, using an unsharp mask.\n",
"\n",
"    amount: 0 is neutral (returns the input unchanged); higher values give\n",
"    sharper images. threshold: pixels where the difference between the blurred\n",
"    and the original image is below this value keep their original value,\n",
"    protecting low-contrast regions from noise amplification.\"\"\"\n",
"    blurred = cv2.GaussianBlur(image, kernel_size, sigma)\n",
"    # the float scalars promote the uint8 arrays to float64, so no overflow here\n",
"    sharpened = float(amount + 1) * image - float(amount) * blurred\n",
"    # clamp back into the valid pixel range and restore the uint8 dtype\n",
"    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)\n",
"    if threshold > 0:\n",
"        # cv2.absdiff avoids the uint8 wrap-around that plain subtraction\n",
"        # of two uint8 arrays would cause before taking the absolute value\n",
"        low_contrast_mask = cv2.absdiff(image, blurred) < threshold\n",
"        np.copyto(sharpened, image, where=low_contrast_mask)\n",
"    return sharpened\n",
"\n",
"\n",
"def invert_image(image):\n",
"    \"\"\"Return the photometric negative (bitwise NOT) of the image.\"\"\"\n",
"    return cv2.bitwise_not(image)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a483868b",
"metadata": {},
"outputs": [],
"source": [
"grey_image = []\n",
"for i in mysubfiles:\n",
" grey_image.append(preprocess(i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08ed750d",
"metadata": {},
"outputs": [],
"source": [
"for image in grey_image:\n",
" # disable default colormap in imshow\n",
" plt.imshow(image[0], cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.imshow(image[1], cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"id": "71ac2229",
"metadata": {},
"source": [
"Maybe further preprocessing could crop out the text regions?"
]
},
{
"cell_type": "markdown",
"id": "7786d09c",
"metadata": {},
"source": [
"# Try out different libraries\n",
"## The standard go-to tool that is slightly complicated: pytesseract\n",
"Install tesseract and the language libraries:\n",
"```\n",
"sudo apt install tesseract-ocr \n",
"sudo apt install tesseract-ocr-all \n",
"sudo apt install imagemagick \n",
"``` "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d69504c",
"metadata": {},
"outputs": [],
"source": [
"from pytesseract import pytesseract\n",
"\n",
"pytesseract.tesseract_cmd = r\"tesseract\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b27c98c-b437-4c8b-8844-96d8718eea49",
"metadata": {},
"outputs": [],
"source": [
"! pip install pytesseract"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "529de8d8",
"metadata": {},
"outputs": [],
"source": [
"myimage = grey_image[1]\n",
"plt.imshow(myimage[0], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"plt.imshow(myimage[1], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"plt.imshow(myimage[2], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"text = pytesseract.image_to_string(myimage[0])\n",
"print(text)\n",
"text = pytesseract.image_to_string(myimage[1])\n",
"print(text)\n",
"text = pytesseract.image_to_string(myimage[2])\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"id": "e25dd39e",
"metadata": {},
"source": [
"Here we would probably need to compare the extraction results from different preprocessed images and merge them into a final text."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36495f3f",
"metadata": {},
"outputs": [],
"source": [
"for image in mysubfiles:\n",
" # Loading image using OpenCV\n",
" img = cv2.imread(image)\n",
"\n",
" # Preprocessing image\n",
" # Converting to grayscale\n",
" gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" # creating Binary image by selecting proper threshold\n",
" binary_image = cv2.threshold(\n",
" gray_image, 130, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU\n",
" )[1]\n",
"\n",
" # Inverting the image\n",
" inverted_bin = cv2.bitwise_not(binary_image)\n",
"\n",
" # Some noise reduction\n",
" kernel = np.ones((2, 2), np.uint8)\n",
" processed_img = cv2.erode(inverted_bin, kernel, iterations=1)\n",
" processed_img = cv2.dilate(processed_img, kernel, iterations=1)\n",
"\n",
" # Applying image_to_string method\n",
" text = pytesseract.image_to_string(processed_img)\n",
" plt.imshow(processed_img, cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.show()\n",
" print(text)"
]
},
{
"cell_type": "markdown",
"id": "d6532019",
"metadata": {},
"source": [
"## keras-ocr\n",
"Not sure how to create an image object without a url.\n",
"https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e38871",
"metadata": {},
"outputs": [],
"source": [
"import keras_ocr\n",
"\n",
"pipeline = keras_ocr.pipeline.Pipeline()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2bb55068-ddd4-4b90-ae94-90181980d3c0",
"metadata": {},
"outputs": [],
"source": [
"! pip install keras-ocr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0002f2c4",
"metadata": {},
"outputs": [],
"source": [
"images = [\n",
" keras_ocr.tools.read(url)\n",
" for url in [\n",
" \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg\",\n",
" \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png\",\n",
" ]\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1567dc85",
"metadata": {},
"outputs": [],
"source": [
"prediction_groups = pipeline.recognize(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb1ca152",
"metadata": {},
"outputs": [],
"source": [
"predicted_image_1 = prediction_groups[0]\n",
"for text, box in predicted_image_1:\n",
" print(text)"
]
},
{
"cell_type": "markdown",
"id": "7b8b929f",
"metadata": {},
"source": [
"## google cloud vision API\n",
"First 1000 images per month are free."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c6ecc88",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\n",
" \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
"] = \"/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n",
"misinformation.explore_analysis(mydict, identify=\"text-on-image\")"
]
},
{
"cell_type": "markdown",
"id": "d54407ad",
"metadata": {},
"source": [
"## MS Azure\n",
"https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"vscode": {
"interpreter": {
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}